diff --git a/Dockerfile.backend b/Dockerfile.backend index 4e340bc3..7a7cd356 100644 --- a/Dockerfile.backend +++ b/Dockerfile.backend @@ -31,7 +31,7 @@ RUN LLAMA_LOCAL_ZIP="dependencies/llama.cpp.zip" \ && cd llama.cpp \ && mkdir -p build && cd build \ && cmake .. \ - && cmake --build . --config Release \ + && cmake --build . --config Release -j\ && if [ ! -f "bin/llama-server" ]; then \ echo "Build failed: llama-server executable not found" && exit 1; \ else \ diff --git a/README.md b/README.md index c376c65e..01baaf52 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ For data synthesis, we utilized [GraphRAG](https://github.com/microsoft/graphrag For model deployment, we utilized [llama.cpp](https://github.com/ggml-org/llama.cpp), which provides efficient inference capabilities. -Our base models primarily come from the [Qwen2.5](https://huggingface.co/Qwen) series. +Our base models primarily come from the [Qwen](https://huggingface.co/Qwen) series. We also want to extend our sincere gratitude to all users who have experienced Second Me. We recognize that there is significant room for optimization throughout the entire pipeline, and we are fully committed to iterative improvements to ensure everyone can enjoy the best possible experience locally. diff --git a/README_ja.md b/README_ja.md index d88e69ee..e9728cb2 100644 --- a/README_ja.md +++ b/README_ja.md @@ -198,7 +198,7 @@ Made with [contrib.rocks](https://contrib.rocks). 
モデルのデプロイには、効率的な推論機能を提供する[llama.cpp](https://github.com/ggml-org/llama.cpp)を使用しました。 -私たちのベースモデルは主に[Qwen2.5](https://huggingface.co/Qwen)シリーズから来ています。 +私たちのベースモデルは主に[Qwen](https://huggingface.co/Qwen)シリーズから来ています。 また、Second Meを体験してくれたすべてのユーザーに心から感謝します。パイプライン全体で最適化の余地が大いにあることを認識しており、皆さんがローカルで最高の体験を楽しめるようにするために、継続的な改善に全力を尽くします。 diff --git a/dependencies/llama.cpp.zip b/dependencies/llama.cpp.zip index 8f801d58..20bedcfb 100644 Binary files a/dependencies/llama.cpp.zip and b/dependencies/llama.cpp.zip differ diff --git a/docs/Custom Model Config(Ollama).md b/docs/Custom Model Config(Ollama).md index af783f97..5d5d3eb4 100644 --- a/docs/Custom Model Config(Ollama).md +++ b/docs/Custom Model Config(Ollama).md @@ -84,7 +84,7 @@ EMBEDDING_MAX_TEXT_LENGTH=embedding_model_context_length ``` Chat: -Model Name: qwen2.5:0.5b +Model Name: qwen3:0.6b API Key: ollama API Endpoint: http://127.0.0.1:11434/v1 diff --git a/lpm_frontend/src/app/dashboard/playground/chat/page.tsx b/lpm_frontend/src/app/dashboard/playground/chat/page.tsx index 921f9d53..be9a0a2e 100644 --- a/lpm_frontend/src/app/dashboard/playground/chat/page.tsx +++ b/lpm_frontend/src/app/dashboard/playground/chat/page.tsx @@ -47,12 +47,14 @@ export default function PlaygroundChat() { const [activeSessionId, setActiveSessionId] = useState(null); const [messages, setMessages] = useState([]); const [modelType, setModelType] = useState(undefined); + const [modelName, setModelName] = useState(''); const originPrompt = useMemo(() => { const name = loadInfo?.name || 'user'; + const isQwen3 = modelName.toLowerCase().includes('qwen3'); if (modelType === 'chat') { - return `You are ${name}'s "Second Me", which is a personalized AI created by ${name}. You can help ${name} answer questions based on your understanding of ${name}'s background information and past records.`; + return `You are ${name}'s "Second Me", which is a personalized AI created by ${name}. 
You can help ${name} answer questions based on your understanding of ${name}'s background information and past records.${isQwen3 ? ' /no_think' : ''}`; } if (modelType === 'thinking') { @@ -76,7 +78,7 @@ export default function PlaygroundChat() { } return ''; - }, [loadInfo, modelType]); + }, [loadInfo, modelType, modelName]); const originSettings = useMemo(() => { return { enableL0Retrieval: true, @@ -110,6 +112,7 @@ export default function PlaygroundChat() { localStorage.setItem('trainingParams', JSON.stringify(data)); setModelType(data.is_cot ? 'thinking' : 'chat'); + setModelName(data.model_name || ''); } else { throw new Error(res.data.message); } diff --git a/lpm_frontend/src/app/dashboard/train/training/page.tsx b/lpm_frontend/src/app/dashboard/train/training/page.tsx index 031c75f3..3978092b 100644 --- a/lpm_frontend/src/app/dashboard/train/training/page.tsx +++ b/lpm_frontend/src/app/dashboard/train/training/page.tsx @@ -48,21 +48,38 @@ interface TrainingDetail { } const baseModelOptions = [ + { + value: 'Qwen3-0.6B', + label: 'Qwen3-0.6B (8GB+ RAM Recommended)' + }, + { + value: 'Qwen3-1.7B', + label: 'Qwen3-1.7B (16GB+ RAM Recommended)' + }, + { + value: 'Qwen3-4B', + label: 'Qwen3-4B (32GB+ RAM Recommended)' + }, + { + value: 'Qwen3-8B', + label: 'Qwen3-8B (64GB+ RAM Recommended)' + }, + // Qwen2.5 models { value: 'Qwen2.5-0.5B-Instruct', - label: 'Qwen2.5-0.5B-Instruct (8GB+ RAM Recommended)' + label: 'Qwen2.5-0.5B (8GB+ RAM Recommended)' }, { value: 'Qwen2.5-1.5B-Instruct', - label: 'Qwen2.5-1.5B-Instruct (16GB+ RAM Recommended)' + label: 'Qwen2.5-1.5B (16GB+ RAM Recommended)' }, { value: 'Qwen2.5-3B-Instruct', - label: 'Qwen2.5-3B-Instruct (32GB+ RAM Recommended)' + label: 'Qwen2.5-3B (32GB+ RAM Recommended)' }, { value: 'Qwen2.5-7B-Instruct', - label: 'Qwen2.5-7B-Instruct (64GB+ RAM Recommended)' + label: 'Qwen2.5-7B (64GB+ RAM Recommended)' } ]; diff --git a/lpm_frontend/src/store/useTrainingStore.ts b/lpm_frontend/src/store/useTrainingStore.ts 
index 2a713cbb..0a618bc0 100644 --- a/lpm_frontend/src/store/useTrainingStore.ts +++ b/lpm_frontend/src/store/useTrainingStore.ts @@ -151,7 +151,7 @@ export const useTrainingStore = create((set, get) => ({ try { const res = await getTrainProgress({ - model_name: config.model_name || 'Qwen2.5-0.5B-Instruct' + model_name: config.model_name || 'Qwen3-0.6B' }); if (res.data.code === 0) { diff --git a/lpm_kernel/L2/convert_hf_to_gguf.py b/lpm_kernel/L2/convert_hf_to_gguf.py index 1993babe..b9cea7e4 100644 --- a/lpm_kernel/L2/convert_hf_to_gguf.py +++ b/lpm_kernel/L2/convert_hf_to_gguf.py @@ -14,18 +14,7 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 -from typing import ( - TYPE_CHECKING, - Any, - Callable, - ContextManager, - Iterable, - Iterator, - Literal, - Sequence, - TypeVar, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain import math @@ -35,8 +24,8 @@ if TYPE_CHECKING: from torch import Tensor -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf logger = logging.getLogger("hf-to-gguf") @@ -44,7 +33,6 @@ ###### MODEL DEFINITIONS ###### - class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -54,11 +42,19 @@ class SentencePieceTokenTypes(IntEnum): BYTE = 6 -AnyModel = TypeVar("AnyModel", bound="type[Model]") +class ModelType(IntEnum): + TEXT = 1 + VISION = 2 + +AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") -class Model: - _model_classes: dict[str, type[Model]] = {} + +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { + ModelType.TEXT: {}, + ModelType.VISION: {}, + } dir_model: Path ftype: gguf.LlamaFileType @@ -77,54 +73,47 @@ class Model: model_name: str | None metadata_override: Path | None dir_model_card: Path + 
remote_hf_model_id: str | None # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__( - self, - dir_model: Path, - ftype: gguf.LlamaFileType, - fname_out: Path, - is_big_endian: bool = False, - use_temp_file: bool = False, - eager: bool = False, - metadata_override: Path | None = None, - model_name: str | None = None, - split_max_tensors: int = 0, - split_max_size: int = 0, - dry_run: bool = False, - small_first_shard: bool = False, - hparams: dict[str, Any] | None = None, - ): - if type(self) is Model: - raise TypeError( - f"{type(self).__name__!r} should not be directly instantiated" - ) + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, + use_temp_file: bool = False, eager: bool = False, + metadata_override: Path | None = None, model_name: str | None = None, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): + if type(self) is ModelBase or \ + type(self) is TextModel or \ + type(self) is VisionModel: + raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out self.is_big_endian = is_big_endian - self.endianess = ( - gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - ) + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file - self.lazy = not eager - self.part_names = Model.get_model_part_names( - self.dir_model, "model", ".safetensors" - ) - self.is_safetensors = len(self.part_names) > 0 - if not self.is_safetensors: - self.part_names = Model.get_model_part_names( - self.dir_model, "pytorch_model", ".bin" - ) - self.hparams = ( - Model.load_hparams(self.dir_model) if hparams is None else hparams - ) - self.block_count = self.find_hparam( - ["n_layers", 
"num_hidden_layers", "n_layer", "num_layers"] - ) + self.lazy = not eager or (remote_hf_model_id is not None) + self.remote_hf_model_id = remote_hf_model_id + if remote_hf_model_id is not None: + self.is_safetensors = True + + def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) + self.tensor_names = set(name for name in remote_tensors.keys()) + for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items(): + yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) + + self.get_tensors = get_remote_tensors + else: + self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") + self.is_safetensors = len(self.part_names) > 0 + if not self.is_safetensors: + self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None self.metadata_override = metadata_override @@ -136,34 +125,21 @@ def __init__( # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
_, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: - logger.info( - f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})" - ) + logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_F16 else: - logger.info( - f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})" - ) + logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_BF16 # Configure GGUF Writer - self.gguf_writer = gguf.GGUFWriter( - path=None, - arch=gguf.MODEL_ARCH_NAMES[self.model_arch], - endianess=self.endianess, - use_temp_file=self.use_temp_file, - split_max_tensors=split_max_tensors, - split_max_size=split_max_size, - dry_run=dry_run, - small_first_shard=small_first_shard, - ) + self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) @@ -173,9 +149,6 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return None raise KeyError(f"could not find any of: {keys}") - def set_vocab(self): - self._set_vocab_gpt2() - def get_tensors(self) -> 
Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() @@ -201,20 +174,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open - - ctx = cast( - ContextManager[Any], - safe_open(self.dir_model / part_name, framework="pt", device="cpu"), - ) + ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) else: - ctx = contextlib.nullcontext( - torch.load( - str(self.dir_model / part_name), - map_location="cpu", - mmap=True, - weights_only=True, - ) - ) + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) with ctx as model_part: tensor_names_from_parts.update(model_part.keys()) @@ -236,38 +198,25 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) - missing_files = sorted( - set(weight_map[n] for n in missing if n in weight_map) - ) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}") + raise ValueError(f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}") else: - raise ValueError( - "Mismatch between weight map and model parts for tensor names:\n" - f"Missing tensors: {missing}\n" - f"Extra tensors: {extra}" - ) - - def format_tensor_name( - self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight" - ) -> str: + raise ValueError("Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}") + + def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = 
None, suffix: str = ".weight") -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError( - f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}" - ) + raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") name: str = gguf.TENSOR_NAMES[key] if "{bid}" in name: assert bid is not None name = name.format(bid=bid) return name + suffix - def match_model_tensor_name( - self, - name: str, - key: gguf.MODEL_TENSOR, - bid: int | None, - suffix: str = ".weight", - ) -> bool: + def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: if key not in gguf.MODEL_TENSORS[self.model_arch]: return False key_name: str = gguf.TENSOR_NAMES[key] @@ -280,84 +229,21 @@ def match_model_tensor_name( return False return name == (key_name + suffix) - def map_tensor_name( - self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias") - ) -> str: + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) if new_name is None: raise ValueError(f"Can not map tensor {name!r}") return new_name def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if ( - n_ctx := self.find_hparam( - ["max_position_embeddings", "n_ctx"], optional=True - ) - ) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - if ( - n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True) - ) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if ( - n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True) - ) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if ( - n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True) - ) is not 
None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if ( - f_norm_eps := self.find_hparam( - ["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True - ) - ) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") + raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused return [(self.map_tensor_name(name), data_torch)] - def tensor_force_quant( - self, name: str, new_name: str, bid: int | None, n_dims: int - ) -> gguf.GGMLQuantizationType | bool: + def 
tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid, n_dims # unused return False @@ -367,17 +253,11 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: return () def prepare_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len( - ".weight," - ) + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - for name, data_torch in chain( - self.generate_extra_tensors(), self.get_tensors() - ): + for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): # we don't need these - if name.endswith( - (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") - ): + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -393,7 +273,7 @@ def prepare_tensors(self): bid = int(part) break - for new_name, data_torch in self.modify_tensors(data_torch, name, bid): + for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): # TODO: why do we squeeze here? 
# data = data_torch.squeeze().numpy() data = data_torch.numpy() @@ -403,9 +283,7 @@ def prepare_tensors(self): data = data_torch.numpy() n_dims = len(data.shape) - data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant( - name, new_name, bid, n_dims - ) + data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors if n_dims <= 1 or new_name.endswith("_norm.weight"): @@ -473,19 +351,13 @@ def prepare_tensors(self): data_qtype = gguf.GGMLQuantizationType.F16 data = gguf.quants.quantize(data, data_qtype) - shape = ( - gguf.quant_shape_from_byte_shape(data.shape, data_qtype) - if data.dtype == np.uint8 - else data.shape - ) + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape # reverse shape to make it similar to the internal ggml dimension order shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" # n_dims is implicit in the shape - logger.info( - f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" - ) + logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) @@ -493,16 +365,14 @@ def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.MODEL) def prepare_metadata(self, vocab_only: bool): - ( - total_params, - shared_params, - expert_params, - expert_count, - ) = self.gguf_writer.get_total_parameter_count() - - self.metadata = gguf.Metadata.load( - self.metadata_override, self.dir_model_card, self.model_name, total_params - ) + + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + # If we are using HF model id, set the metadata name 
to the model id + if self.remote_hf_model_id: + self.metadata.name = self.remote_hf_model_id # Fallback to model directory name if metadata name is still missing if self.metadata.name is None: @@ -510,48 +380,7 @@ def prepare_metadata(self, vocab_only: bool): # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is None and total_params > 0: - self.metadata.size_label = gguf.size_label( - total_params, shared_params, expert_params, expert_count - ) - - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' - output_type: str = self.ftype.name.partition("_")[2] - - # Filename Output - if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata - if not vocab_only: - fname_default: str = gguf.naming_convention( - self.metadata.name, - self.metadata.basename, - self.metadata.finetune, - self.metadata.version, - self.metadata.size_label, - output_type, - model_type="LoRA" if total_params < 0 else None, - ) - else: - fname_default: str = gguf.naming_convention( - self.metadata.name, - self.metadata.basename, - self.metadata.finetune, - self.metadata.version, - size_label=None, - output_type=None, - model_type="vocab", - ) - - # Use the default filename - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - - # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( - self.fname_out.name, output_type - ) + self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) self.set_type() @@ -561,12 +390,12 @@ def prepare_metadata(self, vocab_only: bool): logger.info("Set model 
parameters") self.set_gguf_parameters() - logger.info("Set model tokenizer") - self.set_vocab() - logger.info("Set model quantization version") self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def write_vocab(self): + raise NotImplementedError("write_vocab() must be implemented in subclasses") + def write(self): self.prepare_tensors() self.prepare_metadata(vocab_only=False) @@ -575,15 +404,6 @@ def write(self): self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() - def write_vocab(self): - if len(self.gguf_writer.tensors) != 1: - raise ValueError("Splitting the vocabulary is not supported") - - self.prepare_metadata(vocab_only=True) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - @staticmethod def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: part_names: list[str] = [] @@ -598,30 +418,127 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] @staticmethod def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) + hparams = json.load(f) + architectures = hparams.get("architectures") + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + if architectures is not None: + # preserve "architectures" from root level config + hparams["architectures"] = architectures + return hparams @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: assert names def func(modelcls: AnyModel) -> AnyModel: + model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT for name in names: - cls._model_classes[name] = modelcls + cls._model_classes[model_type][name] = modelcls return modelcls - return func @classmethod def print_registered_models(cls): - for name in sorted(cls._model_classes.keys()): - logger.error(f"- {name}") + for model_type, 
model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: + def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: try: - return cls._model_classes[arch] + return cls._model_classes[model_type][arch] except KeyError: - raise NotImplementedError(f"Architecture {arch!r} not supported!") from None + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + + +class TextModel(ModelBase): + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / 
gguf.fill_templated_filename(self.fname_out.name, output_type) + + logger.info("Set model tokenizer") + self.set_vocab() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + 
self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() def does_token_look_special(self, token: str | bytes) -> bool: if isinstance(token, (bytes, bytearray)): @@ -635,22 +552,14 @@ def does_token_look_special(self, token: str | bytes) -> bool: # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) seems_special = token_text in ( "", # deepseek-coder - "", - "<2mass>", - "[@BOS@]", # gemma{,-2} + "", "<2mass>", "[@BOS@]", # gemma{,-2} ) - seems_special = seems_special or ( - token_text.startswith("<|") and token_text.endswith("|>") - ) - seems_special = seems_special or ( - token_text.startswith("<|") and token_text.endswith("|>") - ) # deepseek-coder + seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) + seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) # deepseek-coder # TODO: should these be marked as UNUSED instead? 
(maybe not) - seems_special = seems_special or ( - token_text.startswith("") - ) # gemma{,-2} + seems_special = seems_special or (token_text.startswith("")) # gemma{,-2} return seems_special @@ -660,18 +569,17 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) - reverse_vocab = { - id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() - } + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() + added_tokens_decoder = tokenizer.added_tokens_decoder + for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") @@ -681,26 +589,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: if token in added_vocab: # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not tokenizer.added_tokens_decoder[i].normalized: + if not added_tokens_decoder[i].normalized: previous_token = token - token = tokenizer.decode( - tokenizer.encode(token, add_special_tokens=False) - ) + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: - logger.info( - f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" - ) + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - if tokenizer.added_tokens_decoder[ - i - ].special or self.does_token_look_special(token): + if added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: # NOTE: this was added for Gemma. 
# Encoding and decoding the tokens above isn't sufficient for this case. - token = token.replace( - b"\xe2\x96\x81".decode("utf-8"), " " - ) # pre-normalize user-defined spaces + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces toktypes.append(gguf.TokenType.USER_DEFINED) else: toktypes.append(gguf.TokenType.NORMAL) @@ -710,7 +610,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # NOTE: this function is generated by convert_hf_to_gguf_update.py # do not modify it manually! - # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # ref: https://github.com/ggml-org/llama.cpp/pull/6920 # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that @@ -718,7 +618,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? 
We\'Ve a\'lL' chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -800,7 +700,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" - if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": + if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": # ref: https://huggingface.co/THUDM/glm-4-9b-chat res = "chatglm-bpe" if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": @@ -851,35 +751,42 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B res = "deepseek-r1-qwen" + if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": + # ref: https://huggingface.co/Xenova/gpt-4o + res = "gpt-4o" + if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": + # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k + res = "superbpe" + if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15": + # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview + res = "trillion" + if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224": + # ref: https://huggingface.co/inclusionAI/Ling-lite + res = "bailingmoe" + if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": + # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + res = "llama4" + if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": + # ref: https://huggingface.co/THUDM/glm-4-9b-hf + res = "glm4" + if chkhsh == 
"0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": + # ref: https://huggingface.co/mistral-community/pixtral-12b + res = "pixtral" if res is None: logger.warning("\n") - logger.warning( - "**************************************************************************************" - ) + logger.warning("**************************************************************************************") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning( - "** - the model has not been added to convert_hf_to_gguf_update.py yet" - ) - logger.warning( - "** - the pre-tokenization config has changed upstream" - ) - logger.warning( - "** Check your model files and convert_hf_to_gguf_update.py and update them accordingly." - ) - logger.warning( - "** ref: https://github.com/ggerganov/llama.cpp/pull/6920" - ) + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") - logger.warning( - "**************************************************************************************" - ) + logger.warning("**************************************************************************************") logger.warning("\n") - raise NotImplementedError( - "BPE pre-tokenizer was not recognized - update get_vocab_base_pre()" - ) + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") logger.debug(f"tokenizer.ggml.pre: {repr(res)}") logger.debug(f"chkhsh: {chkhsh}") @@ -907,7 +814,6 @@ def _set_vocab_qwen(self): toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = 
AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams["vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size @@ -923,13 +829,11 @@ def _set_vocab_qwen(self): continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) assert len(merged) == 2 - merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens - reverse_vocab = { - id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() - } + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} for i in range(vocab_size): if i not in reverse_vocab: @@ -951,16 +855,10 @@ def _set_vocab_qwen(self): special_vocab.merges = merges # only add special tokens when they were not already loaded from config.json if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token( - "bos", tokenizer.special_tokens["<|endoftext|>"] - ) - special_vocab._set_special_token( - "eos", tokenizer.special_tokens["<|endoftext|>"] - ) + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # this one is usually not in config.json anyway - special_vocab._set_special_token( - "unk", tokenizer.special_tokens["<|endoftext|>"] - ) + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_sentencepiece(self, add_to_gguf=True): @@ -978,7 +876,7 @@ def _set_vocab_sentencepiece(self, add_to_gguf=True): def _create_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' if not 
tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") @@ -986,7 +884,7 @@ def _create_vocab_sentencepiece(self): tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -1011,43 +909,38 @@ def _create_vocab_sentencepiece(self): scores[token_id] = score toktypes[token_id] = toktype - added_tokens_file = self.dir_model / "added_tokens.json" + added_tokens_file = self.dir_model / 'added_tokens.json' if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] if token_id >= vocab_size: - logger.warning( - f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" - ) + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue tokens[token_id] = key.encode("utf-8") scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - tokenizer_config_file = self.dir_model / "tokenizer_config.json" + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get( - "added_tokens_decoder", {} - ) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) for token_id, token_data in added_tokens_decoder.items(): token_id = int(token_id) token: str = token_data["content"] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if 
tokens[token_id] != token.encode("utf-8"): - logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}' - ) + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: - token = token.replace( - b"\xe2\x96\x81".decode("utf-8"), " " - ) # pre-normalize user-defined spaces + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED scores[token_id] = -1000.0 @@ -1055,9 +948,7 @@ def _create_vocab_sentencepiece(self): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - logger.debug( - f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" - ) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") for i in range(1, pad_count + 1): tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) scores.append(-1000.0) @@ -1087,13 +978,43 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_builtin( - self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int - ): + def _set_vocab_rwkv_world(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = [''.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + parts = line.split(' ') + assert len(parts) >= 3 + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == 
token_len + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" - logger.warning( - f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'" - ) + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") vocab_reader = gguf.GGUFReader(tokenizer_path, "r") default_pre = "mpt" if model_name == "gpt-neox" else "default" @@ -1103,35 +1024,25 @@ def _set_vocab_builtin( self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre( - bytes(field.parts[-1]).decode("utf-8") if field else default_pre - ) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) assert field # token list - self.gguf_writer.add_token_list( - [bytes(field.parts[i]) for i in field.data][:vocab_size] - ) + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) if model_name == "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) assert 
field # token scores - self.gguf_writer.add_token_scores( - [field.parts[i].tolist()[0] for i in field.data][:vocab_size] - ) + self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) assert field # token types - self.gguf_writer.add_token_types( - [field.parts[i].tolist()[0] for i in field.data][:vocab_size] - ) + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) if model_name != "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) assert field # token merges - self.gguf_writer.add_token_merges( - [bytes(field.parts[i]) for i in field.data] - ) + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) @@ -1147,8 +1058,59 @@ def _set_vocab_builtin( self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): +class VisionModel(ModelBase): + model_arch = gguf.MODEL_ARCH.CLIP_VISION + n_text_embd = 0 + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: + raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION") + + # small hack to correct the number of layers + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128) + self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"]) + assert self.n_embd_text > 0, "n_embd not found in hparams" + + if "vision_config" not in self.hparams: + raise ValueError("vision_config not found in hparams") + # move vision config to the top level, while preserving the original hparams in global_config + self.global_config = self.hparams 
+ self.hparams = self.hparams["vision_config"] + + # load preprocessor config + with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: + self.preprocessor_config = json.load(f) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + + def set_gguf_parameters(self): + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + self.gguf_writer.add_vision_has_vision_encoder(True) + + # vision config + self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"])) + self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"])) + + # preprocessor config + self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_mean"]) + + def write_vocab(self): + raise ValueError("VisionModel does not support vocab writing") + + +@ModelBase.register("GPTNeoXForCausalLM") +class GPTNeoXModel(TextModel): model_arch = gguf.MODEL_ARCH.GPTNEOX def set_gguf_parameters(self): @@ -1159,20 +1121,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count( - int( - self.hparams["rotary_pct"] - * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - ), + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual( - 
self.hparams.get("use_parallel_residual", True) - ) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) @@ -1211,8 +1166,8 @@ def modify_tensors( return tensors -@Model.register("BloomForCausalLM", "BloomModel") -class BloomModel(Model): +@ModelBase.register("BloomForCausalLM", "BloomModel") +class BloomModel(TextModel): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): @@ -1227,15 +1182,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - name = re.sub(r"transformer\.", "", name) + name = re.sub(r'transformer\.', '', name) tensors: list[tuple[str, Tensor]] = [] @@ -1267,22 +1220,11 @@ def modify_tensors( tensors.append((self.map_tensor_name(name), data_torch)) - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all( - s not in self.tensor_names for s in ("lm_head.weight", "output.weight") - ): - tensors.append( - (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) - ) - return tensors -@Model.register("MPTForCausalLM") -class MPTModel(Model): 
+@ModelBase.register("MPTForCausalLM") +class MPTModel(TextModel): model_arch = gguf.MODEL_ARCH.MPT def set_vocab(self): @@ -1309,21 +1251,15 @@ def set_gguf_parameters(self): if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias( - self.hparams["attn_config"]["alibi_bias_max"] - ) + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) else: self.gguf_writer.add_max_alibi_bias(0.0) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused if "scales" in name: - new_name = self.map_tensor_name( - name, try_suffixes=(".weight", ".bias", ".scales") - ) + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) new_name = new_name.replace("scales", "act.scales") else: new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) @@ -1331,8 +1267,8 @@ def modify_tensors( return [(new_name, data_torch)] -@Model.register("OrionForCausalLM") -class OrionModel(Model): +@ModelBase.register("OrionForCausalLM") +class OrionModel(TextModel): model_arch = gguf.MODEL_ARCH.ORION def set_vocab(self): @@ -1366,8 +1302,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(TextModel): model_arch = gguf.MODEL_ARCH.BAICHUAN def set_vocab(self): @@ -1393,27 +1329,18 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) 
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) @@ -1422,61 +1349,41 @@ def modify_tensors( if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": logger.info(f"Unpacking and permuting layer {bid}") tensors = [ - ( - self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), - self._reverse_hf_permute_part( - data_torch, 0, head_count, head_count - ), - ), - ( - self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - self._reverse_hf_permute_part( - data_torch, 1, head_count, head_count_kv - ), - ), - ( - self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - self._reverse_hf_part(data_torch, 2), - ), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + 
self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2)), ] else: tensors = [(self.map_tensor_name(name), data_torch)] return tensors - def _reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape) ) def _reverse_hf_permute_part( - self, - weights: Tensor, - n_part: int, - n_head: int, - n_head_kv: int | None = None, + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, ) -> Tensor: r = weights.shape[0] // 3 - return self._reverse_hf_permute( - weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv - ) + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] + return weights[r * n_part:r * n_part + r, ...] 
-@Model.register("XverseForCausalLM") -class XverseModel(Model): +@ModelBase.register("XverseForCausalLM") +class XverseModel(TextModel): model_arch = gguf.MODEL_ARCH.XVERSE def set_vocab(self): @@ -1488,7 +1395,6 @@ def set_vocab(self): toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, @@ -1497,18 +1403,16 @@ def set_vocab(self): if max_vocab_index >= vocab_size: raise ValueError("Vocabulary size exceeds expected maximum size.") - reverse_vocab: dict[int, str] = { - id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() - } + reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode("utf-8") + token_text = reverse_vocab[token_id].encode('utf-8') # replace "\x00" to string with length > 0 if token_text == b"\x00": toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode("utf-8") - elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): toktype = gguf.TokenType.BYTE # special elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: @@ -1549,27 +1453,18 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) 
self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused head_count = self.hparams["num_attention_heads"] @@ -1583,23 +1478,19 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] - def _reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape) ) -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(TextModel): model_arch = gguf.MODEL_ARCH.FALCON def set_gguf_parameters(self): @@ -1625,9 +1516,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) 
- def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused # QKV tensor transform @@ -1642,14 +1531,10 @@ def modify_tensors( if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = ( - self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 - ) + n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 head_dim = self.hparams["hidden_size"] // n_head - qkv = data_torch.view( - n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head - ) + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) @@ -1658,8 +1543,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): +@ModelBase.register("GPTBigCodeForCausalLM") +class StarCoderModel(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER def set_gguf_parameters(self): @@ -1675,19 +1560,16 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): +@ModelBase.register("GPTRefactForCausalLM") +class RefactModel(TextModel): model_arch = gguf.MODEL_ARCH.REFACT def set_vocab(self): super().set_vocab() # TODO: how to determine special FIM tokens automatically? 
- special_vocab = gguf.SpecialVocab( - self.dir_model, - load_merges=False, - special_token_types=["prefix", "suffix", "middle", "eot"], - ) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot']) special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) @@ -1714,9 +1596,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: hidden_dim = self.hparams["n_embd"] inner_dim = 4 * hidden_dim hidden_dim = int(2 * inner_dim / 3) @@ -1730,35 +1610,13 @@ def modify_tensors( if bid is not None: if name == f"transformer.h.{bid}.attn.kv.weight": - tensors.append( - ( - self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - data_torch[: n_head_kv * head_dim], - ) - ) - tensors.append( - ( - self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - data_torch[n_head_kv * head_dim :], - ) - ) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) elif name == f"transformer.h.{bid}.attn.q.weight": - tensors.append( - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch) - ) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - tensors.append( - ( - self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), - data_torch[:ff_dim], - ) - ) - tensors.append( - ( - self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), - data_torch[ff_dim:], 
- ) - ) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) if len(tensors) == 0: tensors.append((self.map_tensor_name(name), data_torch)) @@ -1766,10 +1624,8 @@ def modify_tensors( return tensors -@Model.register( - "StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM" -) -class StableLMModel(Model): +@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(TextModel): model_arch = gguf.MODEL_ARCH.STABLELM def set_vocab(self): @@ -1788,30 +1644,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count( - int( - rotary_factor - * (hparams["hidden_size"] // hparams["num_attention_heads"]) - ) - ) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_parallel_residual( - hparams["use_parallel_residual"] - if "use_parallel_residual" in hparams - else True - ) - self.gguf_writer.add_layer_norm_eps( - self.find_hparam(["layer_norm_eps", "norm_eps"]) - ) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) self.gguf_writer.add_file_type(self.ftype) _q_norms: list[dict[str, Tensor]] | None = None _k_norms: list[dict[str, Tensor]] | None = None - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, 
Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams["num_key_value_heads"] @@ -1824,9 +1667,7 @@ def modify_tensors( self._q_norms[bid][name] = data_torch if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm( - bid, n_head, self._q_norms[bid], "q_layernorm" - ) + return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") else: return [] @@ -1839,21 +1680,13 @@ def modify_tensors( self._k_norms[bid][name] = data_torch if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm( - bid, n_kv_head, self._k_norms[bid], "k_layernorm" - ) + return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") else: return [] return [(self.map_tensor_name(name), data_torch)] - def _stack_qk_norm( - self, - bid: int, - n_head: int, - norms: dict[str, Tensor], - layer_name: str = "q_layernorm", - ): + def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): datas: list[Tensor] = [] # extract the norms in order for xid in range(n_head): @@ -1873,23 +1706,35 @@ def prepare_tensors(self): if self._q_norms is not None or self._k_norms is not None: # flatten two `list[dict[str, Tensor]]` into a single `list[str]` norms = ( - [k for d in self._q_norms for k in d.keys()] - if self._q_norms is not None - else [] + [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] ) + ( - [k for d in self._k_norms for k in d.keys()] - if self._k_norms is not None - else [] + [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] ) if len(norms) > 0: raise ValueError(f"Unprocessed norms: {norms}") -@Model.register( - "LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM" -) -class LlamaModel(Model): +@ModelBase.register( + "LLaMAForCausalLM", + "LlamaForCausalLM", + 
"MistralForCausalLM", + "MixtralForCausalLM", + "Idefics3ForConditionalGeneration", + "SmolVLMForConditionalGeneration", + "LlavaForConditionalGeneration") +class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA + undo_permute = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + # fix for Pixtral, missing `num_attention_heads` in config.json + if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \ + and self.hparams.get("model_type") == "mistral": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) def set_vocab(self): try: @@ -1904,24 +1749,21 @@ def set_vocab(self): # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( - self.dir_model, - load_merges=False, - special_token_types=["prefix", "suffix", "middle", "eot"], + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] ) special_vocab._set_special_token("prefix", 32007) special_vocab._set_special_token("suffix", 32008) special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) + special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) - tokenizer_config_file = self.dir_model / "tokenizer_config.json" + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix( - tokenizer_config_json["add_prefix_space"] - ) + 
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: @@ -1938,40 +1780,41 @@ def set_gguf_parameters(self): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) _experts: list[dict[str, Tensor]] | None = None - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + is_vision_tensor = "vision_tower" in name \ + or "vision_model" in name \ + or "model.connector" in name \ + or 
"multi_modal_projector" in name + + if is_vision_tensor: + return [] # skip vision tensors + elif name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM + elif name.startswith("language_model."): + name = name.replace("language_model.", "") # for the rest + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately if name.find("block_sparse_moe.experts") != -1: @@ -2011,26 +1854,19 @@ def modify_tensors( def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", "").lower() == "llama3": + if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get( - "head_dim", - self.hparams["hidden_size"] // self.hparams["num_attention_heads"], - ) - freqs = 1.0 / ( - base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim) - ) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get( - "original_max_position_embeddings", 8192 - ) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen + # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4 rope_factors = [] for freq in freqs: @@ -2040,15 +1876,10 @@ def 
generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: elif wavelen > low_freq_wavelen: rope_factors.append(factor) else: - smooth = (old_context_len / wavelen - low_freq_factor) / ( - high_freq_factor - low_freq_factor - ) + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), - torch.tensor(rope_factors, dtype=torch.float32), - ) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): super().prepare_tensors() @@ -2060,8 +1891,144 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeciLMForCausalLM") -class DeciModel(Model): +@ModelBase.register("LlavaForConditionalGeneration") +class LlavaVisionModel(VisionModel): + img_break_tok_id = -1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams["model_type"] == "pixtral": + # fix missing config.json values + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096) + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024) + self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) + self.img_break_tok_id = 12 # see tokenizer_config.json + else: + raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if hparams["model_type"] == "pixtral": + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) 
+ self.gguf_writer.add_vision_use_silu(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + n_head = self.hparams["num_attention_heads"] + n_kv_head = n_head + + if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."): + # process vision tensors + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + if self.img_break_tok_id > 0 and "embed_tokens.weight" in name: + logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") + # for pixtral model, we need to extract the [IMG_BREAK] token embedding + img_break_embd = data_torch[self.img_break_tok_id] + name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] + return [(self.map_tensor_name(name), img_break_embd)] + + return [] # skip other tensors + + +@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") +class SmolVLMModel(VisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code + if self.hparams["model_type"] == "smolvlm_vision": + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) + self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + 
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if is_vision_tensor: + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Llama4ForConditionalGeneration") +class Llama4Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA4 + undo_permute = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this + self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] + self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] + + def set_vocab(self): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # split the gate_up into gate and up + if "gate_up_proj" in name: + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + dim_half = data_torch.shape[-1] // 2 + gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) + return [ + (self.map_tensor_name(name_gate), gate_proj_weight), + (self.map_tensor_name(name_up), 
up_proj_weight) + ] + + if name.endswith("down_proj"): + name += ".weight" + data_torch = data_torch.transpose(-1, -2) + + if "multi_modal_projector" in name or "vision_model" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Mistral3ForConditionalGeneration") +class Mistral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("language_model.", "") + if "multi_modal_projector" in name or "vision_tower" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("DeciLMForCausalLM") +class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @staticmethod @@ -2080,8 +2047,8 @@ def _find_multiple(n: int, k: int) -> int: def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - _block_configs: list[dict[str, Any]] = self.hparams["block_configs"] + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] assert self.block_count == len(_block_configs) self._num_kv_heads = list() self._num_heads = list() @@ -2105,28 +2072,17 @@ def __init__(self, *args, **kwargs): self._num_kv_heads.append(0) self._num_heads.append(0) else: - self._num_kv_heads.append( - self.hparams["num_attention_heads"] - // _block_configs[il]["attention"]["n_heads_in_group"] - ) + self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) self._num_heads.append(self.hparams["num_attention_heads"]) _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(_ffn_multipliers) - assert isinstance(self._num_kv_heads, list) and isinstance( - self._num_kv_heads[0], int - ) 
- assert isinstance(self._num_heads, list) and isinstance( - self._num_heads[0], int - ) - assert isinstance(_ffn_multipliers, list) and isinstance( - _ffn_multipliers[0], float - ) + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) + assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) self._ffn_dims: list[int] = [ - DeciModel._ffn_mult_to_intermediate_size( - multiplier, self.hparams["hidden_size"] - ) + DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) for multiplier in _ffn_multipliers ] @@ -2147,7 +2103,7 @@ def set_vocab(self): self._set_vocab_llama_hf() def set_gguf_parameters(self): - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(self._ffn_dims) @@ -2160,19 +2116,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) - self.gguf_writer.add_value_length( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_file_type(self.ftype) - else: # DeciLM-7B + else: # DeciLM-7B super().set_gguf_parameters() - if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B - self._num_kv_heads: list[int] = self.hparams[ - 
"num_key_value_heads_per_layer" - ] + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] assert self.block_count == len(self._num_kv_heads) self.gguf_writer.add_head_count_kv(self._num_kv_heads) hparams = self.hparams @@ -2184,31 +2134,20 @@ def set_gguf_parameters(self): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] if bid is not None: if "num_key_value_heads_per_layer" in self.hparams: @@ -2229,22 +2168,15 @@ def modify_tensors( def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", "").lower() == "llama3": + if 
rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get( - "head_dim", - self.hparams["hidden_size"] // self.hparams["num_attention_heads"], - ) - freqs = 1.0 / ( - base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim) - ) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get( - "original_max_position_embeddings", 8192 - ) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -2258,22 +2190,17 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: elif wavelen > low_freq_wavelen: rope_factors.append(factor) else: - smooth = (old_context_len / wavelen - low_freq_factor) / ( - high_freq_factor - low_freq_factor - ) + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), - torch.tensor(rope_factors, dtype=torch.float32), - ) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): super().prepare_tensors() -@Model.register("BitnetForCausalLM") -class BitnetModel(Model): +@ModelBase.register("BitnetForCausalLM") +class BitnetModel(TextModel): model_arch = gguf.MODEL_ARCH.BITNET def set_vocab(self): @@ -2295,31 +2222,26 @@ def weight_quant(self, weight: Tensor) -> Tensor: result = (weight * iscale).round().clamp(-1, 1) / iscale return result.type(dtype) - def modify_tensors( - 
self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) - if any( - self.match_model_tensor_name(new_name, key, bid) - for key in [ - gguf.MODEL_TENSOR.ATTN_Q, - gguf.MODEL_TENSOR.ATTN_K, - gguf.MODEL_TENSOR.ATTN_V, - gguf.MODEL_TENSOR.ATTN_OUT, - gguf.MODEL_TENSOR.FFN_UP, - gguf.MODEL_TENSOR.FFN_DOWN, - gguf.MODEL_TENSOR.FFN_GATE, - ] - ): + if any(self.match_model_tensor_name(new_name, key, bid) for key in [ + gguf.MODEL_TENSOR.ATTN_Q, + gguf.MODEL_TENSOR.ATTN_K, + gguf.MODEL_TENSOR.ATTN_V, + gguf.MODEL_TENSOR.ATTN_OUT, + gguf.MODEL_TENSOR.FFN_UP, + gguf.MODEL_TENSOR.FFN_DOWN, + gguf.MODEL_TENSOR.FFN_GATE, + ]): # transform weight into 1/0/-1 (in fp32) data_torch = self.weight_quant(data_torch) yield (new_name, data_torch) -@Model.register("GrokForCausalLM") -class GrokModel(Model): +@ModelBase.register("GrokForCausalLM") +class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): @@ -2333,9 +2255,7 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find(".moe.") != -1: n_experts = self.hparams["num_local_experts"] @@ -2355,9 +2275,7 @@ def modify_tensors( datas: list[Tensor] = [] for xid in range(n_experts): - ename = ( - f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" - ) + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" datas.append(self._experts[bid][ename]) del self._experts[bid][ename] @@ -2375,8 +2293,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): 
+@ModelBase.register("DbrxForCausalLM") +class DbrxModel(TextModel): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): @@ -2403,9 +2321,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused n_expert = self.hparams["ffn_config"]["moe_num_experts"] @@ -2417,15 +2333,9 @@ def modify_tensors( # But llama.cpp moe graph works differently # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = { - "ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": ( - 0, - 2, - 1, - ), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None, - } # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): @@ -2442,22 +2352,18 @@ def modify_tensors( # Every other model has the weight names ending in .weight, # let's assume that is the convention which is not the case for dbrx: # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = self.map_tensor_name( - name if not experts else name + ".weight", try_suffixes=(".weight",) - ) + new_name = self.map_tensor_name(name if not experts else name + ".weight", 
try_suffixes=(".weight",)) return [(new_name, data_torch)] - def tensor_force_quant( - self, name: str, new_name: str, bid: int | None, n_dims: int - ) -> gguf.GGMLQuantizationType | bool: + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid # unused return n_dims > 1 -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): +@ModelBase.register("MiniCPMForCausalLM") +class MiniCPMModel(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): @@ -2465,9 +2371,7 @@ def set_gguf_parameters(self): embedding_scale = float(self.hparams["scale_emb"]) self.gguf_writer.add_embedding_scale(embedding_scale) logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") - residual_scale = ( - self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 - ) + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 self.gguf_writer.add_residual_scale(residual_scale) logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] @@ -2476,46 +2380,29 @@ def set_gguf_parameters(self): if self.hparams.get("rope_scaling") is not None: if self.hparams["rope_scaling"].get("type") == "longrope": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info( - f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}" - ) + logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - rope_scaling = self.find_hparam(["rope_scaling"], True) + rope_scaling = self.find_hparam(['rope_scaling'], True) if rope_scaling is not None: - long_factors = rope_scaling.get("long_factor", None) - short_factors = rope_scaling.get("short_factor", None) + 
long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) if long_factors is None or short_factors is None: - raise KeyError( - "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" - ) + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - if ( - len(long_factors) != len(short_factors) - or len(long_factors) != rope_dims / 2 - ): - raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}" - ) + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), - torch.tensor(long_factors, dtype=torch.float32), - ) - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), - torch.tensor(short_factors, dtype=torch.float32), - ) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): self._set_vocab_sentencepiece() - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams["num_attention_heads"] @@ -2530,8 +2417,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(Model): +@ModelBase.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM3 def set_gguf_parameters(self): @@ -2549,74 +2436,52 @@ def set_gguf_parameters(self): if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: 
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length( - hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"] - ) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_scaling = self.find_hparam(["rope_scaling"], True) + rope_scaling = self.find_hparam(['rope_scaling'], True) if rope_scaling is not None: rope_dims = self.hparams["qk_rope_head_dim"] - long_factors = rope_scaling.get("long_factor", None) - short_factors = rope_scaling.get("short_factor", None) + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) if long_factors is None or short_factors is None: - raise KeyError( - "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" - ) + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - if ( - len(long_factors) != len(short_factors) - or len(long_factors) != rope_dims / 2 - ): - raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}" - ) + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), - torch.tensor(long_factors, dtype=torch.float32), - ) - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), - torch.tensor(short_factors, dtype=torch.float32), - ) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): 
self._set_vocab_sentencepiece() - def _reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape) ) -@Model.register("QWenLMHeadModel") -class QwenModel(Model): +@ModelBase.register("QWenLMHeadModel") +class QwenModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) @staticmethod - def bpe( - mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None - ) -> list[bytes]: + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -2629,11 +2494,7 @@ def bpe( if min_rank is None or (max_rank is not None and min_rank >= max_rank): break assert min_idx is not None - parts = ( - parts[:min_idx] - + [parts[min_idx] + parts[min_idx + 1]] - + parts[min_idx + 2 :] - ) + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] return parts def set_vocab(self): @@ -2645,16 +2506,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // 
self.hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): +@ModelBase.register("Qwen2ForCausalLM") +class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 def set_vocab(self): @@ -2665,22 +2524,15 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "yarn": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) - self.gguf_writer.add_rope_scaling_orig_ctx_len( - self.hparams["rope_scaling"]["original_max_position_embeddings"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("Qwen2VLForConditionalGeneration") -class Qwen2VLModel(Model): +@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL def set_gguf_parameters(self): @@ -2695,27 +2547,25 @@ def set_vocab(self): except FileNotFoundError: self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, data in super().get_tensors(): - if name.startswith("visual."): - continue - yield name, data + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, 
Tensor]]: + del bid # unused + if name.startswith("visual."): + # skip visual tensors + return [] + return [(self.map_tensor_name(name), data_torch)] -@Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): +@ModelBase.register("WavTokenizerDec") +class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if ( - name.endswith("codebook.cluster_size") - or name.endswith("codebook.embed_avg") - or name.endswith("codebook.inited") - ): + if \ + name.endswith("codebook.cluster_size") or \ + name.endswith("codebook.embed_avg") or \ + name.endswith("codebook.inited"): logger.debug(f"Skipping {name!r}") return [] @@ -2728,53 +2578,39 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_features_length(self.hparams["n_embd_features"]) + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) - self.gguf_writer.add_group_norm_eps(self.hparams["group_norm_epsilon"]) - self.gguf_writer.add_group_norm_groups(self.hparams["group_norm_groups"]) + self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) - self.gguf_writer.add_posnet_block_count(self.hparams["posnet"]["n_layer"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) - self.gguf_writer.add_convnext_embedding_length( - self.hparams["convnext"]["n_embd"] - ) - 
self.gguf_writer.add_convnext_block_count(self.hparams["convnext"]["n_layer"]) + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) self.gguf_writer.add_causal_attention(False) -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): +@ModelBase.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2MOE def set_gguf_parameters(self): super().set_gguf_parameters() if (n_experts := self.hparams.get("num_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) - if ( - moe_intermediate_size := self.hparams.get("moe_intermediate_size") - ) is not None: + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if ( - shared_expert_intermediate_size := self.hparams.get( - "shared_expert_intermediate_size" - ) - ) is not None: - self.gguf_writer.add_expert_shared_feed_forward_length( - shared_expert_intermediate_size - ) - logger.info( - f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}" - ) + if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: + self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) + logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") _experts: list[dict[str, Tensor]] | None = None - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] @@ -2820,8 +2656,18 @@ def prepare_tensors(self): 
raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): +@ModelBase.register("Qwen3ForCausalLM") +class Qwen3Model(Qwen2Model): + model_arch = gguf.MODEL_ARCH.QWEN3 + + +@ModelBase.register("Qwen3MoeForCausalLM") +class Qwen3MoeModel(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3MOE + + +@ModelBase.register("GPT2LMHeadModel") +class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): @@ -2833,9 +2679,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused tensors: list[tuple[str, Tensor]] = [] @@ -2844,26 +2688,18 @@ def modify_tensors( if name.endswith((".attn.bias", ".attn.masked_bias")): return tensors - if name.endswith( - (".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight") - ): + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): data_torch = data_torch.transpose(1, 0) new_name = self.map_tensor_name(name) tensors.append((new_name, data_torch)) - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append( - (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) - ) - return tensors -@Model.register("PhiForCausalLM") -class Phi2Model(Model): +@ModelBase.register("PhiForCausalLM") +class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): @@ -2873,54 +2709,51 @@ def set_gguf_parameters(self): n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_context_length( - 
self.find_hparam(["n_positions", "max_position_embeddings"]) - ) + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps( - self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]) - ) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): +@ModelBase.register("Phi3ForCausalLM") +class Phi3MiniModel(TextModel): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): # Phi-4 model uses GPT2Tokenizer - tokenizer_config_file = self.dir_model / "tokenizer_config.json" + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) - tokenizer_class = tokenizer_config_json["tokenizer_class"] - if tokenizer_class == "GPT2Tokenizer": + tokenizer_class = tokenizer_config_json['tokenizer_class'] + if tokenizer_class == 'GPT2Tokenizer': return self._set_vocab_gpt2() from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' if not tokenizer_path.is_file(): - raise ValueError(f"Error: Missing {tokenizer_path}") + raise ValueError(f'Error: Missing {tokenizer_path}') tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', 
tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) @@ -2939,7 +2772,7 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype - added_tokens_file = self.dir_model / "added_tokens.json" + added_tokens_file = self.dir_model / 'added_tokens.json' if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -2947,37 +2780,31 @@ def set_vocab(self): for key in added_tokens_json: token_id = added_tokens_json[key] if token_id >= vocab_size: - logger.debug( - f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" - ) + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue tokens[token_id] = key.encode("utf-8") scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - tokenizer_config_file = self.dir_model / "tokenizer_config.json" + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get( - "added_tokens_decoder", {} - ) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) for token_id, foken_data in added_tokens_decoder.items(): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: - logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' - ) + logger.warning(f'replacing token {token_id}: 
{tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - tokenizer_file = self.dir_model / "tokenizer.json" + tokenizer_file = self.dir_model / 'tokenizer.json' if tokenizer_file.is_file(): with open(tokenizer_file, "r", encoding="utf-8") as f: tokenizer_json = json.load(f) @@ -2987,9 +2814,7 @@ def set_vocab(self): token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: - logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' - ) + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -3014,14 +2839,13 @@ def set_gguf_parameters(self): rms_eps = self.find_hparam(["rms_norm_eps"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head self.gguf_writer.add_context_length(max_pos_embds) self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length( - self.find_hparam(["intermediate_size"]) - ) + self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head_kv) @@ -3040,61 +2864,43 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: n_head = 
self.find_hparam(["num_attention_heads", "n_head"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(["rope_scaling"], True) + rope_scaling = self.find_hparam(['rope_scaling'], True) if rope_scaling is None: return scale = max_pos_embds / orig_max_pos_embds - rope_scaling_type = rope_scaling.get("type", "").lower() + rope_scaling_type = rope_scaling.get('type', '').lower() if len(rope_scaling_type) == 0: - raise KeyError("Missing the required key rope_scaling.type") + raise KeyError('Missing the required key rope_scaling.type') - if rope_scaling_type == "su" or rope_scaling_type == "longrope": - attn_factor = ( - math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) - if scale > 1.0 - else 1.0 - ) - elif rope_scaling_type == "yarn": + if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': + attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 + elif rope_scaling_type == 'yarn': attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 else: - raise NotImplementedError( - f"The rope scaling type {rope_scaling_type} is not supported yet" - ) + raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - long_factors = rope_scaling.get("long_factor", None) - short_factors = rope_scaling.get("short_factor", None) + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) if long_factors is None or short_factors is None: - raise KeyError( - "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" - ) + raise 
KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - if ( - len(long_factors) != len(short_factors) - or len(long_factors) != rope_dims / 2 - ): - raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}" - ) + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.') - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), - torch.tensor(long_factors, dtype=torch.float32), - ) - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), - torch.tensor(short_factors, dtype=torch.float32), - ) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) -@Model.register("PhiMoEForCausalLM") +@ModelBase.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): model_arch = gguf.MODEL_ARCH.PHIMOE @@ -3105,9 +2911,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) self.gguf_writer.add_expert_count(self.hparams["num_local_experts"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -3132,9 +2936,7 @@ def modify_tensors( data_torch = torch.stack(datas, dim=0) - merged_name = ( - f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - ) + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" new_name = 
self.map_tensor_name(merged_name) @@ -3155,8 +2957,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): +@ModelBase.register("PlamoForCausalLM") +class PlamoModel(TextModel): model_arch = gguf.MODEL_ARCH.PLAMO def set_vocab(self): @@ -3171,9 +2973,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv( - 5 - ) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) @@ -3191,9 +2991,7 @@ def shuffle_attn_output_weight(self, data_torch): data_torch = torch.reshape(data_torch, (5120, 5120)) return data_torch - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused new_name = self.map_tensor_name(name) @@ -3207,8 +3005,8 @@ def modify_tensors( return [(new_name, data_torch)] -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): +@ModelBase.register("CodeShellForCausalLM") +class CodeShellModel(TextModel): model_arch = gguf.MODEL_ARCH.CODESHELL def set_gguf_parameters(self): @@ -3226,31 +3024,30 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - del bid # unused + _has_tok_embd = False - new_name = self.map_tensor_name(name) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> 
Iterable[tuple[str, Tensor]]: + del bid # unused - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None + new_name = self.map_tensor_name(name) - if all( - s not in self.tensor_names for s in ("lm_head.weight", "output.weight") - ): - # copy tok_embd.weight to output.weight - tensors.append( - (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) - ) + # assuming token_embd.weight is seen before output.weight + if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + # even though the tensor file(s) does not contain the word embeddings they are still in the weight map + if self.tensor_names and "transformer.wte.weight" in self.tensor_names: + logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied") + self.tensor_names.remove("transformer.wte.weight") + elif new_name == tok_embd_name: + self._has_tok_embd = True - return tensors + return [(new_name, data_torch)] -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): @@ -3261,25 +3058,24 @@ def set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' tokens: list[bytes] = [] scores: list[float] = [] toktypes: list[int] = [] if not tokenizer_path.is_file(): - logger.error(f"Error: Missing {tokenizer_path}") + logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - with 
open(tokenizer_path, "rb") as file: - sentencepiece_model.ParseFromString(file.read()) + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) for token_id in range(vocab_size): piece = tokenizer.IdToPiece(token_id) @@ -3301,14 +3097,14 @@ def set_vocab(self): elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE # take care of ununsed raw token - if piece.startswith("[UNUSED"): + if piece.startswith('[UNUSED'): toktype = SentencePieceTokenTypes.UNUSED tokens.append(text) scores.append(score) toktypes.append(toktype) - added_tokens_file = self.dir_model / "added_tokens.json" + added_tokens_file = self.dir_model / 'added_tokens.json' if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) @@ -3318,16 +3114,14 @@ def set_vocab(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - chat_eos_token = "<|im_end|>" + chat_eos_token = '<|im_end|>' chat_eos_token_id = None - tokenizer_config_file = self.dir_model / "tokenizer_config.json" + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get( - "added_tokens_decoder", {} - ) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) for token_id, foken_data in added_tokens_decoder.items(): token_id = int(token_id) token = foken_data["content"] @@ -3336,16 +3130,14 @@ def set_vocab(self): token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != 
token: - logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' - ) + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED if foken_data.get("special"): toktypes[token_id] = SentencePieceTokenTypes.CONTROL - tokenizer_file = self.dir_model / "tokenizer.json" + tokenizer_file = self.dir_model / 'tokenizer.json' if tokenizer_file.is_file(): with open(tokenizer_file, "r", encoding="utf-8") as f: tokenizer_json = json.load(f) @@ -3358,9 +3150,7 @@ def set_vocab(self): token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: - logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' - ) + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -3379,12 +3169,10 @@ def set_vocab(self): if chat_eos_token_id is not None: # For the chat model, we replace the eos with '<|im_end|>'. # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 + # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id - logger.warning( - f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" - " in chat mode so that the conversation can end normally." 
- ) + logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" + " in chat mode so that the conversation can end normally.") special_vocab.add_to_gguf(self.gguf_writer) @@ -3398,19 +3186,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_file_type(self.ftype) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] n_embd = self.hparams["hidden_size"] @@ -3422,13 +3203,11 @@ def modify_tensors( qkv = data_torch qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) - q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1] + q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] # The model weights of q and k equire additional reshape. 
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) - k = LlamaModel.permute( - k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads - ) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) v = v.reshape((-1, v.shape[-1])) return [ @@ -3440,8 +3219,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): +@ModelBase.register("InternLM3ForCausalLM") +class InternLM3Model(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA def set_vocab(self): @@ -3455,28 +3234,21 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - tokenizer_config_file = self.dir_model / "tokenizer_config.json" + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix( - tokenizer_config_json["add_prefix_space"] - ) + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) if "added_tokens_decoder" in tokenizer_config_json: - for token_id, token_data in tokenizer_config_json[ - "added_tokens_decoder" - ].items(): + for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): if token_data.get("special"): token_id = int(token_id) token = token_data["content"] special_vocab._set_special_token(token, token_id) # update eos token - if ( - token == "<|im_end|>" - and "eos" in special_vocab.special_token_ids - ): + if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: special_vocab.special_token_ids["eos"] = token_id special_vocab.add_to_gguf(self.gguf_writer) @@ -3492,22 +3264,12 @@ def set_gguf_parameters(self): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - if ( 
- self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): - if ( - self.hparams["rope_scaling"].get("type") == "linear" - or self.hparams["rope_scaling"].get("rope_type") == "linear" - ): + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") if name.endswith(("q_proj.weight", "q_proj.bias")): @@ -3517,8 +3279,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") -class BertModel(Model): +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") +class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): @@ -3542,9 +3304,7 @@ def set_gguf_parameters(self): # get pooling type if pooling_path is not None: - with open( - self.dir_model / pooling_path / "config.json", encoding="utf-8" - ) as f: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: pooling = json.load(f) if pooling["pooling_mode_mean_tokens"]: pooling_type = gguf.PoolingType.MEAN @@ -3570,7 +3330,6 @@ def phantom(tok): if tok.startswith("##"): return tok[2:] return "\u2581" + tok - tokens = list(map(phantom, tokens)) # add vocab to gguf @@ -3583,9 +3342,7 @@ def phantom(tok): special_vocab = 
gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused if name.startswith("bert."): @@ -3598,12 +3355,8 @@ def modify_tensors( name = name[:-5] + ".bias" # we are only using BERT for embeddings so we don't need the pooling layer - if name in ( - "embeddings.position_ids", - "pooler.dense.weight", - "pooler.dense.bias", - ): - return [] # we don't need these + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + return [] # we don't need these if name.startswith("cls.predictions"): return [] @@ -3613,14 +3366,7 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] - -@Model.register("RobertaModel") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - + def _xlmroberta_tokenizer_init(self) -> None: # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id @@ -3629,111 +3375,29 @@ def __init__(self, *args, **kwargs): else: self._position_offset = None - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count( - self.hparams.get("type_vocab_size", 1) - ) - - else: - return super().set_vocab() - - def 
modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset :, :] - - return super().modify_tensors(data_torch, name, bid) - - -@Model.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.NOMIC_BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # the HF config claims n_ctx=8192, but it uses RoPE scaling - self.hparams["n_ctx"] = 2048 - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors - assert self.hparams["qkv_proj_bias"] is False - assert self.hparams["mlp_fc1_bias"] is False - assert self.hparams["mlp_fc2_bias"] is False - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - - -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if 
"max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): + def _xlmroberta_set_vocab(self) -> None: # to avoid TypeError: Descriptors cannot be created directly # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / "sentencepiece.bpe.model" + tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - with open(tokenizer_path, "rb") as file: - sentencepiece_model.ParseFromString(file.read()) + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = ( - sentencepiece_model.normalizer_spec.remove_extra_whitespaces - ) + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -3760,16 +3424,14 @@ def set_vocab(self): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - logger.debug( - f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" - ) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through 
[PAD{pad_count}]") for i in range(1, pad_count + 1): tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) # realign tokens (see HF tokenizer code) - tokens = [b"", b"", b"", b""] + tokens[3:-1] + tokens = [b'', b'', b'', b''] + tokens[3:-1] scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] toktypes = [ SentencePieceTokenTypes.CONTROL, @@ -3795,9 +3457,39 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + +@ModelBase.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "roberta.", remove the prefix # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main if name.startswith("roberta."): @@ -3806,29 +3498,126 @@ def modify_tensors( # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: - data_torch = data_torch[self._position_offset :, :] + data_torch = data_torch[self._position_offset:,:] return super().modify_tensors(data_torch, name, bid) -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): +@ModelBase.register("NomicBertModel") +class NomicBertModel(BertModel): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model) + + self.is_moe = bool(hparams.get("moe_every_n_layers")) + self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() + if self._tokenizer_is_xlmroberta: + self._xlmroberta_tokenizer_init() + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" + + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors unless MoE + assert self.hparams["qkv_proj_bias"] == self.is_moe + assert self.hparams["mlp_fc1_bias"] == self.is_moe + assert self.hparams["mlp_fc2_bias"] == self.is_moe + + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_vocab(self) -> None: + if self._tokenizer_is_xlmroberta: + return 
self._xlmroberta_set_vocab() + return super().set_vocab() + + def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: + # If the tensor is an experts bias tensor, skip it by returning an empty list. + if "mlp.experts.bias" in name: + return [] # Explicitly return an empty list. + + if "mlp.experts.mlp.w1" in name: + data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) + name += ".weight" + + if "mlp.experts.mlp.w2" in name: + data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) + data_torch = data_torch.transpose(1, 2) + name += ".weight" + + return [(self.map_tensor_name(name), data_torch)] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + if self.is_moe: + self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) + self.gguf_writer.add_expert_count(self.hparams["num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + + def _is_tokenizer_xlmroberta(self) -> bool: + with open(self.dir_model / "tokenizer.json") as f: + tokenizer_json = json.load(f) + toktyp = tokenizer_json["model"]["type"] + if toktyp == "Unigram": + return True + if toktyp == "WordPiece": + return False + raise ValueError(f"unknown tokenizer: {toktyp}") + + +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._xlmroberta_tokenizer_init() + + def set_vocab(self): + self._xlmroberta_set_vocab() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("GemmaForCausalLM") +class GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA def set_vocab(self): self._set_vocab_sentencepiece() # TODO: these special tokens should be exported only for the CodeGemma family - special_vocab = gguf.SpecialVocab( - self.dir_model, - load_merges=False, - special_token_types=["prefix", "suffix", "middle", "fsep", "eot"], - ) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) special_vocab._set_special_token("prefix", 67) special_vocab._set_special_token("suffix", 69) special_vocab._set_special_token("middle", 68) - special_vocab._set_special_token("fsep", 70) - special_vocab._set_special_token("eot", 107) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) @@ -3843,27 +3632,19 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv( - self.hparams["num_key_value_heads"] - if "num_key_value_heads" in hparams - else hparams["num_attention_heads"] - ) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) 
self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": - logger.debug( - f"Skipping get tensor {name!r} in safetensors so that convert can end normally." - ) + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") return [] # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 @@ -3873,8 +3654,8 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): +@ModelBase.register("Gemma2ForCausalLM") +class Gemma2Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): @@ -3891,11 +3672,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv( - self.hparams["num_key_value_heads"] - if "num_key_value_heads" in hparams - else hparams["num_attention_heads"] - ) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) @@ -3908,17 +3685,13 @@ def set_gguf_parameters(self): ) 
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": - logger.debug( - f"Skipping get tensor {name!r} in safetensors so that convert can end normally." - ) + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") return [] # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 @@ -3928,53 +3701,115 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): - model_arch = gguf.MODEL_ARCH.STARCODER2 +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(TextModel): + model_arch = gguf.MODEL_ARCH.GEMMA3 + def set_vocab(self): + self._set_vocab_sentencepiece() -@Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): - model_arch = gguf.MODEL_ARCH.RWKV6 + self.gguf_writer.add_add_space_prefix(False) - def set_vocab(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] - tokens: list[bytes] = ["".encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] + # some default values are not specified in the hparams + self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + 
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) + self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers + # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3 + assert hparams.get("attn_logit_softcapping") is None + assert hparams.get("final_logit_softcapping") is None + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) + if hparams.get("rope_scaling") is not None: + assert hparams["rope_scaling"]["rope_type"] == "linear" + # important: this rope_scaling is only applied for global layers, and not used by 1B model + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - with open( - self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8" - ) as f: - lines = f.readlines() - for line in lines: - parts = line.split(" ") - assert len(parts) >= 3 - token, token_len = ( - ast.literal_eval(" ".join(parts[1:-1])), - int(parts[-1]), - ) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | 
None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.chat_template = "rwkv-world" - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - special_vocab.add_to_gguf(self.gguf_writer) + if name.startswith("language_model."): + name = name.replace("language_model.", "") + + elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + return [] # skip vision tensors + + # remove OOV (out-of-vocabulary) rows in token_embd + if "embed_tokens.weight" in name: + vocab = self._create_vocab_sentencepiece() + tokens = vocab[0] + data_torch = data_torch[:len(tokens)] + + # ref code in Gemma3RMSNorm + # output = output * (1.0 + self.weight.float()) + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + +@ModelBase.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(VisionModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." 
in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + # process vision tensors + name = name.replace("_weight", ".weight") + + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Starcoder2ForCausalLM") +class StarCoder2Model(TextModel): + model_arch = gguf.MODEL_ARCH.STARCODER2 + + +@ModelBase.register("Rwkv6ForCausalLM") +class Rwkv6Model(TextModel): + model_arch = gguf.MODEL_ARCH.RWKV6 + + def set_vocab(self): + self._set_vocab_rwkv_world() def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -3982,11 +3817,7 @@ def set_gguf_parameters(self): hidden_size = self.hparams["hidden_size"] layer_norm_eps = self.hparams["layer_norm_epsilon"] rescale_every_n_layers = self.hparams["rescale_every"] - intermediate_size = ( - self.hparams["intermediate_size"] - if self.hparams["intermediate_size"] is not None - else int((hidden_size * 3.5) // 32 * 32) - ) + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) time_mix_extra_dim = 64 if hidden_size == 4096 else 32 time_decay_extra_dim = 128 if hidden_size == 4096 else 64 @@ -4007,19 +3838,13 @@ def set_gguf_parameters(self): lerp_weights: dict[int, dict[str, Tensor]] = {} - def modify_tensors( - self, data_torch: 
Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) if not (new_name.endswith(".weight") or new_name.endswith(".bias")): new_name += ".weight" - if ( - new_name.endswith("time_mix_w1.weight") - or new_name.endswith("time_mix_decay_w1.weight") - or new_name.endswith("time_mix_decay_w2.weight") - ): + if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): data_torch = data_torch.transpose(0, 1) if new_name.endswith("time_mix_w2.weight"): @@ -4031,47 +3856,28 @@ def modify_tensors( try: rescale_every_n_layers = self.hparams["rescale_every"] if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith( - "channel_mix_value.weight" - ): - data_torch = data_torch.div_( - 2 ** int(bid // rescale_every_n_layers) - ) + if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): + data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) except KeyError: pass # concat time_mix_lerp weights to reduce some cpu overhead # also reduces the number of tensors in the model - if ( - bid is not None - and "time_mix_lerp" in new_name - and "time_mix_lerp_x" not in new_name - ): + if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: try: self.lerp_weights[bid][new_name] = data_torch except KeyError: self.lerp_weights[bid] = {new_name: data_torch} - if all( - f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() - for i in ["w", "k", "v", "r", "g"] - ): + if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = torch.stack( - [ - self.lerp_weights[bid][ - 
f"blk.{bid}.time_mix_lerp_{i}.weight" - ].unsqueeze(0) - for i in ["w", "k", "v", "r", "g"] - ], - dim=0, - ).unsqueeze(1) + data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) yield (new_name, data) return yield (new_name, data_torch) -@Model.register("RWKV6Qwen2ForCausalLM") +@ModelBase.register("RWKV6Qwen2ForCausalLM") class RWKV6Qwen2Model(Rwkv6Model): model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 @@ -4083,54 +3889,212 @@ def set_vocab(self): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - num_attention_heads = self.hparams["num_attention_heads"] - num_key_value_heads = self.hparams["num_key_value_heads"] + num_attention_heads = self.hparams["num_attention_heads"] + num_key_value_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + head_size = hidden_size // num_attention_heads + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32) + time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # special parameters for time_mixing in RWKV6QWEN2 + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_token_shift_count(1) + # RWKV6QWEN2 use grouped key/value like GQA + self.gguf_writer.add_head_count_kv(num_key_value_heads) + + # required by 
llama.cpp, unused + self.gguf_writer.add_head_count(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + for new_name, data in super().modify_tensors(data_torch, name, bid): + if "time_mix_w1" in new_name or "time_mix_w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg + # permute them here to avoid code changes + data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) + if "w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + yield (new_name, data) + continue + yield (new_name, data) + + +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(TextModel): + model_arch = gguf.MODEL_ARCH.RWKV7 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def calc_lora_rank(self, hidden_size, exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + try: + head_size = self.hparams["head_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + except KeyError: + head_size = self.hparams["head_dim"] + layer_norm_eps = self.hparams["norm_eps"] + hidden_size = self.hparams["hidden_size"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) + + # ICLR: In-Context-Learning-Rate + try: + lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + 
lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + except KeyError: + lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + lora_needs_transpose: bool = True + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # unify tensor names here to make life easier + name = name.replace("blocks", "layers").replace("ffn", "feed_forward") + name = name.replace("self_attn", "attention").replace("attn", "attention") + name = name.replace("time_mixer.", "") + # lora layer names in fla-hub's impl + if "_lora.lora" in name: + 
self.lora_needs_transpose = False + name = name.replace("_lora.lora.0.weight", "1.weight") + name = name.replace("_lora.lora.2.weight", "2.weight") + name = name.replace("_lora.lora.2.bias", "0.weight") + + name = name.replace("feed_forward_norm", "ln2") + name = name.replace("g_norm", "ln_x") + + if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: + # some models have dummy v0/v1/v2 on first layer while others don't + # ignore them all since they are not used + return + + wkv_has_gate = self.hparams.get("wkv_has_gate", True) + lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] + + if bid is not None and "attention.x_" in name: + if "attention.x_x" in name: + # already concatenated + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = data_torch.reshape(len(lerp_list), 1, 1, -1) + yield (new_name, data) + else: + try: + self.lerp_weights[bid][name] = data_torch + except KeyError: + self.lerp_weights[bid] = {name: data_torch} + if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) + yield (new_name, data) + return + else: + data_torch = data_torch.squeeze() + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if self.lora_needs_transpose and any( + new_name.endswith(t) for t in [ + "time_mix_w1.weight", "time_mix_w2.weight", + "time_mix_a1.weight", "time_mix_a2.weight", + "time_mix_v1.weight", "time_mix_v2.weight", + "time_mix_g1.weight", "time_mix_g2.weight", + ] + ): + data_torch = data_torch.transpose(0, 1) + + if 'r_k' in new_name: + data_torch = data_torch.flatten() + + if bid == 0 and "time_mix_a" in new_name: + # dummy v0/v1/v2 on first layer + # easist way to make llama happy + yield 
(new_name.replace("time_mix_a", "time_mix_v"), data_torch) + + yield (new_name, data_torch) + + +@ModelBase.register("RwkvHybridForCausalLM") +class ARwkv7Model(Rwkv7Model): + model_arch = gguf.MODEL_ARCH.ARWKV7 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] hidden_size = self.hparams["hidden_size"] - head_size = hidden_size // num_attention_heads + head_size = self.hparams["head_size"] rms_norm_eps = self.hparams["rms_norm_eps"] intermediate_size = self.hparams["intermediate_size"] - time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 - time_decay_extra_dim = 128 if hidden_size >= 4096 else 64 + wkv_has_gate = self.hparams["wkv_has_gate"] + assert self.hparams["wkv_version"] == 7 + + # ICLR: In-Context-Learning-Rate + lora_rank_decay = 64 + lora_rank_iclr = 64 + lora_rank_value_residual_mix = 32 + lora_rank_gate = 128 if wkv_has_gate else 0 # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) - - # special parameters for time_mixing in RWKV6QWEN2 - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) self.gguf_writer.add_token_shift_count(1) - # RWKV6QWEN2 use grouped key/value like GQA - 
self.gguf_writer.add_head_count_kv(num_key_value_heads) # required by llama.cpp, unused self.gguf_writer.add_head_count(0) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - for new_name, data in super().modify_tensors(data_torch, name, bid): - if "time_mix_w1" in new_name or "time_mix_w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg - # permute them here to avoid code changes - data = torch.stack( - [data[3], data[1], data[2], data[0], data[4]], dim=0 - ).view(-1, data.shape[-1]) - if "w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - yield (new_name, data) - continue - yield (new_name, data) - -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(Model): +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(TextModel): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): @@ -4151,23 +4115,15 @@ def set_vocab(self): self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = ( - self.find_hparam(["intermediate_size", "d_inner"], optional=True) - or 2 * d_model - ) - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 # ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = 
self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( - d_model // -16 - ) - rms_norm_eps = ( - self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) - or 1e-5 - ) + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 use_dt_b_c_norm = False # For falconmamba we do apply RMS norm on B / DT and C layers if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): @@ -4175,34 +4131,22 @@ def set_gguf_parameters(self): # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_context_length( - 2**20 - ) # arbitrary value; for those who use the default + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length( - 0 - ) # unused, but seemingly required when loading - self.gguf_writer.add_head_count( - 0 - ) # unused, but seemingly required when loading + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms( - use_dt_b_c_norm - ) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, 
Tensor]]: - del bid # unused - + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) @@ -4212,12 +4156,14 @@ def modify_tensors( logger.debug("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) + # [4 1 8192 1] -> [4 8192 1 1] + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + # assuming token_embd.weight is seen before output.weight if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): - logger.debug( - f"{output_name} is equivalent to {tok_embd_name}, omitting" - ) + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") return [] elif new_name == tok_embd_name: self._tok_embd = data_torch @@ -4225,8 +4171,8 @@ def modify_tensors( return [(new_name, data_torch)] -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): +@ModelBase.register("CohereForCausalLM") +class CommandR2Model(TextModel): model_arch = gguf.MODEL_ARCH.COMMAND_R def __init__(self, *args, **kwargs): @@ -4235,9 +4181,7 @@ def __init__(self, *args, **kwargs): # max_position_embeddings = 8192 in config.json but model was actually # trained on 128k context length # aya-23 models don't have model_max_length specified - self.hparams["max_position_embeddings"] = self.find_hparam( - ["model_max_length", "max_position_embeddings"] - ) + self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -4245,8 +4189,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): +@ModelBase.register("Cohere2ForCausalLM") +class Cohere2Model(TextModel): 
model_arch = gguf.MODEL_ARCH.COHERE2 def set_gguf_parameters(self): @@ -4259,15 +4203,13 @@ def set_gguf_parameters(self): rotary_pct = self.hparams["rotary_pct"] hidden_size = self.hparams["hidden_size"] num_attention_heads = self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count( - int(rotary_pct * (hidden_size // num_attention_heads)) - ) + self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): +@ModelBase.register("OlmoForCausalLM") +@ModelBase.register("OLMoForCausalLM") +class OlmoModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMO def set_gguf_parameters(self): @@ -4279,9 +4221,7 @@ def set_gguf_parameters(self): # Same as super class, but permuting q_proj, k_proj # Copied from: LlamaModel - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused n_head = self.hparams["num_attention_heads"] @@ -4295,13 +4235,13 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): +@ModelBase.register("Olmo2ForCausalLM") +class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 -@Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): +@ModelBase.register("OlmoeForCausalLM") +class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE def set_gguf_parameters(self): @@ -4313,9 +4253,7 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None # Copied from: Qwen2MoeModel - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | 
None) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] @@ -4362,7 +4300,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("JinaBertModel", "JinaBertForMaskedLM") +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -4372,13 +4310,13 @@ def __init__(self, *args, **kwargs): def get_tensors(self): for name, data in super().get_tensors(): - if "gated_layer" in name: - d1 = data[: self.intermediate_size, :] - name1 = name.replace("gated_layers", "gated_layers_w") - name1 = name1.replace("up_gated_layer", "gated_layers_v") - d2 = data[self.intermediate_size :, :] - name2 = name.replace("gated_layers", "gated_layers_v") - name2 = name2.replace("up_gated_layer", "gated_layers_w") + if 'gated_layer' in name: + d1 = data[:self.intermediate_size, :] + name1 = name.replace('gated_layers', 'gated_layers_w') + name1 = name1.replace('up_gated_layer', 'gated_layers_v') + d2 = data[self.intermediate_size:, :] + name2 = name.replace('gated_layers', 'gated_layers_v') + name2 = name2.replace('up_gated_layer', 'gated_layers_w') yield name1, d1 yield name2, d2 continue @@ -4386,25 +4324,21 @@ def get_tensors(self): yield name, data def set_vocab(self): - tokenizer_class = "BertTokenizer" + tokenizer_class = 'BertTokenizer' with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_class = json.load(f)["tokenizer_class"] + tokenizer_class = json.load(f)['tokenizer_class'] - if tokenizer_class == "BertTokenizer": + if tokenizer_class == 'BertTokenizer': super().set_vocab() - elif tokenizer_class == "RobertaTokenizer": + elif tokenizer_class == 'RobertaTokenizer': self._set_vocab_gpt2() self.gguf_writer.add_token_type_count(2) else: - raise NotImplementedError( - f"Tokenizer {tokenizer_class} is not supported for JinaBertModel" - ) + 
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "bert.", remove the prefix # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en if name.startswith("bert."): @@ -4413,8 +4347,8 @@ def modify_tensors( return super().modify_tensors(data_torch, name, bid) -@Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): +@ModelBase.register("OpenELMForCausalLM") +class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @staticmethod @@ -4438,12 +4372,8 @@ def __init__(self, *args, **kwargs): OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) for multiplier in ffn_multipliers ] - assert isinstance(self._num_kv_heads, list) and isinstance( - self._num_kv_heads[0], int - ) - assert isinstance(self._num_query_heads, list) and isinstance( - self._num_query_heads[0], int - ) + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) # Uses the tokenizer from meta-llama/Llama-2-7b-hf def set_vocab(self): @@ -4480,27 +4410,20 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return super().find_hparam(keys, optional) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # split ff if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": ff_dim = self._ffn_dims[bid] - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), 
- data_torch[:ff_dim], - ) - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), - data_torch[ff_dim:], - ) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) return yield (self.map_tensor_name(name), data_torch) -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): +@ModelBase.register("ArcticForCausalLM") +class ArcticModel(TextModel): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): @@ -4509,23 +4432,24 @@ def set_vocab(self): # tokenizer.model and used them as BOS and EOS instead of adding new tokens. from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' if not tokenizer_path.is_file(): - logger.error(f"Error: Missing {tokenizer_path}") + logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) # Read the whole vocabulary from the tokenizer.model file tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) @@ -4546,7 +4470,7 @@ def set_vocab(self): # Use the added_tokens_decoder field from tokeniser_config.json as the source # of information about added/redefined tokens and modify them accordingly. 
- tokenizer_config_file = self.dir_model / "tokenizer_config.json" + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) @@ -4556,9 +4480,7 @@ def set_vocab(self): for token_id, token_json in added_tokens_decoder.items(): token_id = int(token_id) if token_id >= vocab_size: - logger.debug( - f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" - ) + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue token_content = token_json["content"] @@ -4574,9 +4496,7 @@ def set_vocab(self): token_type = SentencePieceTokenTypes.CONTROL token_score = 0.0 - logger.info( - f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})" - ) + logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") tokens[token_id] = token_content.encode("utf-8") toktypes[token_id] = token_type scores[token_id] = token_score @@ -4594,15 +4514,11 @@ def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count( - hparams["hidden_size"] // hparams["num_attention_heads"] - ) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) _experts: list[dict[str, Tensor]] | None = None - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -4657,8 +4573,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekForCausalLM") -class 
DeepseekModel(Model): +@ModelBase.register("DeepseekForCausalLM") +class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK def set_vocab(self): @@ -4679,9 +4595,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length( - hparams["moe_intermediate_size"] - ) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) self.gguf_writer.add_expert_weights_scale(1.0) self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) @@ -4692,17 +4606,11 @@ def set_gguf_parameters(self): def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -4756,15 +4664,19 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): +@ModelBase.register("DeepseekV2ForCausalLM") +@ModelBase.register("DeepseekV3ForCausalLM") +class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 def set_vocab(self): self._set_vocab_gpt2() def 
set_gguf_parameters(self): + + # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + super().set_gguf_parameters() hparams = self.hparams @@ -4773,13 +4685,14 @@ def set_gguf_parameters(self): if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length( - hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"] - ) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_expert_feed_forward_length( - hparams["moe_intermediate_size"] - ) + + # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA + self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) + + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) @@ -4790,33 +4703,20 @@ def set_gguf_parameters(self): elif hparams["scoring_func"] == "softmax": self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) else: - raise ValueError( - f"Unsupported scoring_func value: {hparams['scoring_func']}" - ) + raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - if ( - self.hparams.get("rope_scaling") is not None - and "factor" in self.hparams["rope_scaling"] - ): + if self.hparams.get("rope_scaling") is not None and "factor" in 
self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "yarn": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor( - self.hparams["rope_scaling"]["factor"] - ) - self.gguf_writer.add_rope_scaling_orig_ctx_len( - self.hparams["rope_scaling"]["original_max_position_embeddings"] - ) - self.gguf_writer.add_rope_scaling_yarn_log_mul( - 0.1 * hparams["rope_scaling"]["mscale_all_dim"] - ) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"]) _experts: list[dict[str, Tensor]] | None = None - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # rename e_score_correction_bias tensors if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") @@ -4860,6 +4760,26 @@ def modify_tensors( else: return [] + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + + return [ + (self.map_tensor_name(name_kb), k_b), + (self.map_tensor_name(name_vb), v_b) + ] + 
return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): @@ -4872,11 +4792,34 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("T5WithLMHeadModel") -@Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") -class T5Model(Model): +@ModelBase.register("PLMForCausalLM") +class PLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.PLM + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") +class T5Model(TextModel): model_arch = gguf.MODEL_ARCH.T5 def __init__(self, *args, **kwargs): @@ -4890,37 +4833,34 @@ def set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / "spiece.model" + tokenizer_path = self.dir_model / 'spiece.model' if not tokenizer_path.is_file(): raise 
FileNotFoundError(f"File not found: {tokenizer_path}") sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - with open(tokenizer_path, "rb") as file: - sentencepiece_model.ParseFromString(file.read()) + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram if sentencepiece_model.trainer_spec.model_type == 2: # BPE # assure the tokenizer model file name is correct - assert tokenizer_path.name == "tokenizer.model" + assert tokenizer_path.name == 'tokenizer.model' return self._set_vocab_sentencepiece() else: assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = ( - sentencepiece_model.normalizer_spec.remove_extra_whitespaces - ) + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -4945,16 +4885,14 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype - added_tokens_file = self.dir_model / "added_tokens.json" + added_tokens_file = self.dir_model / 'added_tokens.json' if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] if token_id >= vocab_size: - logger.warning( - f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" - ) + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue 
tokens[token_id] = key.encode("utf-8") @@ -4963,9 +4901,7 @@ def set_vocab(self): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - logger.debug( - f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" - ) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") for i in range(1, pad_count + 1): tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) scores.append(-1000.0) @@ -4989,9 +4925,7 @@ def set_vocab(self): def set_gguf_parameters(self): if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning( - "Couldn't find context length in config.json, assuming default value of 512" - ) + logger.warning("Couldn't find context length in config.json, assuming default value of 512") n_ctx = 512 self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) @@ -5001,43 +4935,31 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length(self.hparams["d_kv"]) self.gguf_writer.add_value_length(self.hparams["d_kv"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count( - self.hparams["relative_attention_num_buckets"] - ) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_decoder_start_token_id( - self.hparams["decoder_start_token_id"] - ) + self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", 
# "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder # and decoder and ignore the remaining ones. - if name in [ - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - "shared.weight", - ]: + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: if not self.shared_token_embeddings_found: name = "shared.weight" self.shared_token_embeddings_found = True else: - logger.debug( - f"Skipping shared tensor {name!r} in safetensors so that convert can end normally." - ) + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") return [] return [(self.map_tensor_name(name), data_torch)] -@Model.register("T5EncoderModel") -class T5EncoderModel(Model): +@ModelBase.register("T5EncoderModel") +class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER def __init__(self, *args, **kwargs): @@ -5051,37 +4973,34 @@ def set_vocab(self): from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model - tokenizer_path = self.dir_model / "tokenizer.model" + tokenizer_path = self.dir_model / 'tokenizer.model' # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / "spiece.model" + tokenizer_path = self.dir_model / 'spiece.model' if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - with open(tokenizer_path, "rb") as file: - sentencepiece_model.ParseFromString(file.read()) + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram if sentencepiece_model.trainer_spec.model_type == 2: # BPE # 
assure the tokenizer model file name is correct - assert tokenizer_path.name == "tokenizer.model" + assert tokenizer_path.name == 'tokenizer.model' return self._set_vocab_sentencepiece() else: assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = ( - sentencepiece_model.normalizer_spec.remove_extra_whitespaces - ) + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -5106,16 +5025,14 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype - added_tokens_file = self.dir_model / "added_tokens.json" + added_tokens_file = self.dir_model / 'added_tokens.json' if added_tokens_file.is_file(): with open(added_tokens_file, "r", encoding="utf-8") as f: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] if token_id >= vocab_size: - logger.warning( - f"ignore token {token_id}: id is out of range, max={vocab_size - 1}" - ) + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue tokens[token_id] = key.encode("utf-8") @@ -5124,9 +5041,7 @@ def set_vocab(self): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - logger.debug( - f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" - ) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") for i in range(1, pad_count + 1): tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) scores.append(-1000.0) @@ 
-5150,9 +5065,7 @@ def set_vocab(self): def set_gguf_parameters(self): if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning( - "Couldn't find context length in config.json, assuming default value of 512" - ) + logger.warning("Couldn't find context length in config.json, assuming default value of 512") n_ctx = 512 self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) @@ -5162,40 +5075,30 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length(self.hparams["d_kv"]) self.gguf_writer.add_value_length(self.hparams["d_kv"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count( - self.hparams["relative_attention_num_buckets"] - ) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder # and decoder and ignore the remaining ones. 
- if name in [ - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - "shared.weight", - ]: + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: if not self.shared_token_embeddings_found: name = "shared.weight" self.shared_token_embeddings_found = True else: - logger.debug( - f"Skipping shared tensor {name!r} in safetensors so that convert can end normally." - ) + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") return [] return [(self.map_tensor_name(name), data_torch)] -@Model.register("JAISLMHeadModel") -class JaisModel(Model): +@ModelBase.register("JAISLMHeadModel") +class JaisModel(TextModel): model_arch = gguf.MODEL_ARCH.JAIS def __init__(self, *args, **kwargs): @@ -5208,21 +5111,19 @@ def __init__(self, *args, **kwargs): # Embeddings scale self.embeddings_scale = 1.0 - if "mup_embeddings_scale" in self.hparams: - self.embeddings_scale = self.hparams["mup_embeddings_scale"] - elif "embeddings_scale" in self.hparams: - self.embeddings_scale = self.hparams["embeddings_scale"] + if 'mup_embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] else: assert False self.width_scale = 1.0 - if "mup_output_alpha" in self.hparams: - assert "mup_width_scale" in self.hparams - self.width_scale = ( - self.hparams["mup_output_alpha"] * self.hparams["mup_width_scale"] - ) - elif "width_scale" in self.hparams: - self.width_scale = self.hparams["width_scale"] + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] else: assert False @@ -5240,9 +5141,7 @@ def set_gguf_parameters(self): 
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused tensors: list[tuple[str, Tensor]] = [] @@ -5262,9 +5161,7 @@ def modify_tensors( return tensors - if name.endswith( - (".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight") - ): + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): data_torch = data_torch.transpose(1, 0) new_name = self.map_tensor_name(name) @@ -5283,8 +5180,39 @@ def prepare_tensors(self): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): +@ModelBase.register("Glm4ForCausalLM") +class Glm4Model(TextModel): + model_arch = gguf.MODEL_ARCH.GLM4 + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + 
super().set_gguf_parameters() + rope_dim = self.hparams["head_dim"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + + +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(TextModel): model_arch = gguf.MODEL_ARCH.CHATGLM def set_vocab_chatglm3(self): @@ -5295,23 +5223,11 @@ def set_vocab_chatglm3(self): scores: list[float] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) assert max(tokenizer.get_vocab().values()) < vocab_size - role_special_tokens = [ - "<|system|>", - "<|user|>", - "<|assistant|>", - "<|observation|>", - ] - special_tokens = [ - "[MASK]", - "[gMASK]", - "[sMASK]", - "sop", - "eop", - ] + role_special_tokens + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens for token_id in range(vocab_size): piece = tokenizer._convert_id_to_token(token_id) if token_id == 0: @@ -5369,14 +5285,11 @@ def set_vocab_chatglm3(self): @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - byte_encoder = bytes_to_unicode() - return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) @staticmethod - def bpe( - mergeable_ranks: 
dict[bytes, int], token: bytes, max_rank: int | None = None - ) -> list[bytes]: + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -5389,11 +5302,7 @@ def bpe( if min_rank is None or (max_rank is not None and min_rank >= max_rank): break assert min_idx is not None - parts = ( - parts[:min_idx] - + [parts[min_idx] + parts[min_idx + 1]] - + parts[min_idx + 2 :] - ) + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] return parts def set_vocab(self): @@ -5407,97 +5316,58 @@ def set_vocab(self): toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["padded_vocab_size"] + vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) assert max(tokenizer.get_vocab().values()) < vocab_size - tokpre = self.get_vocab_base_pre(tokenizer) - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[ChatGLMModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) >= 2 and len(merged) <= 7 - merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.get_added_vocab() - reverse_vocab = { - id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() - } - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - 
toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - + tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) # only add special tokens when they were not already loaded from config.json - special_vocab._set_special_token( - "eos", tokenizer.get_added_vocab()["<|endoftext|>"] - ) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # this one is usually not in config.json anyway - special_vocab._set_special_token( - "unk", tokenizer.get_added_vocab()["<|endoftext|>"] - ) + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_head_kv = self.hparams.get("multi_query_group_num", n_head) + n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length( - self.hparams.get("ffn_hidden_size", 4 * n_embed) - ) - self.gguf_writer.add_block_count(self.hparams["num_layers"]) + self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) + self.gguf_writer.add_block_count(self.hparams.get("num_layers", 
self.hparams["num_hidden_layers"])) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_dimension_count(64) + if "attention_dim" in self.hparams: + rope_dim = self.hparams["attention_dim"] + else: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) self.gguf_writer.add_add_bos_token(False) rope_freq = 10000 if "rope_ratio" in self.hparams: rope_freq = rope_freq * self.hparams["rope_ratio"] self.gguf_writer.add_rope_freq_base(rope_freq) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.endswith(".rotary_pos_emb.inv_freq"): + if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."): return [] name = name.removeprefix("transformer.") return [(self.map_tensor_name(name), data_torch)] -@Model.register("NemotronForCausalLM") -class NemotronModel(Model): +@ModelBase.register("NemotronForCausalLM") +class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON def set_vocab(self): @@ -5510,15 +5380,11 @@ def set_gguf_parameters(self): hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - f_norm_eps = self.find_hparam( - ["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"] - ) + f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) self.gguf_writer.add_layer_norm_eps(f_norm_eps) # * Partial RoPE - rot_pct = self.find_hparam( - ["partial_rotary_factor", "rope_pct", 
"rope_percent"] - ) + rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) @@ -5530,9 +5396,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side # model.layers.{l}.input_layernorm.weight # model.layers.{l}.post_attention_layernorm.weight @@ -5543,25 +5407,21 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): +@ModelBase.register("ExaoneForCausalLM") +class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): hparams = self.hparams - assert hparams["activation_function"] == "silu" + assert (hparams["activation_function"] == "silu") max_position_embeddings = hparams["max_position_embeddings"] embed_dim = hparams["hidden_size"] num_heads = hparams["num_attention_heads"] num_kv_heads = hparams.get("num_key_value_heads", num_heads) layer_norm_eps = hparams["layer_norm_epsilon"] - intermediate_size = ( - hparams["intermediate_size"] - if "intermediate_size" in hparams - else 4 * embed_dim - ) + intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim num_layers = hparams["num_layers"] # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 # attention_dropout_rate = hparams["attention_dropout"] @@ -5578,44 +5438,25 @@ def 
set_gguf_parameters(self): if (rope_theta := self.hparams.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) - rotary_factor = self.find_hparam( - ["partial_rotary_factor", "rope_pct"], optional=True - ) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) rotary_factor = rotary_factor if rotary_factor is not None else 1.0 - self.gguf_writer.add_rope_dimension_count( - int( - rotary_factor - * (hparams["hidden_size"] // hparams["num_attention_heads"]) - ) - ) - if ( - hparams.get("rope_scaling") is not None - and "factor" in hparams["rope_scaling"] - ): + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]: if hparams["rope_scaling"].get("type") == "linear": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor( - hparams["rope_scaling"]["factor"] - ) + self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", "").lower() == "llama3": + if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get( - "head_dim", - self.hparams["hidden_size"] // self.hparams["num_attention_heads"], - ) - freqs = 1.0 / ( - base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim) - ) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) - old_context_len = 
self.hparams.get( - "original_max_position_embeddings", 8192 - ) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -5629,21 +5470,15 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: elif wavelen > low_freq_wavelen: rope_factors.append(factor) else: - smooth = (old_context_len / wavelen - low_freq_factor) / ( - high_freq_factor - low_freq_factor - ) + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), - torch.tensor(rope_factors, dtype=torch.float32), - ) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) -@Model.register("GraniteForCausalLM") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE def set_gguf_parameters(self): @@ -5675,15 +5510,12 @@ def set_gguf_parameters(self): logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("GraniteMoeForCausalLM") +@ModelBase.register("GraniteMoeForCausalLM") class GraniteMoeModel(GraniteModel): """Conversion for IBM's GraniteMoeForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE_MOE - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: """In modeling_granitemoe, the JetMoe implementation of parallel experts is used. This essentially merges w1 and w3 into a single tensor with 2x the hidden size that is then split during forward. 
To keep compatibility @@ -5692,9 +5524,7 @@ def modify_tensors( if name.endswith("block_sparse_moe.input_linear.weight"): ffn_dim = self.hparams["intermediate_size"] - assert ( - data_torch.shape[-2] == 2 * ffn_dim - ), "Merged FFN tensor size must be 2 * intermediate_size" + assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] return [ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), @@ -5704,9 +5534,108 @@ def modify_tensors( return super().modify_tensors(data_torch, name, bid) -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(Model): +@ModelBase.register("BailingMoeForCausalLM") +class BailingMoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.BAILINGMOE + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["num_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 
2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + n_embd = self.hparams["hidden_size"] + head_dim = self.hparams.get("head_dim") or n_embd // n_head + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + + if name.endswith("attention.dense.weight"): + return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)] + elif name.endswith("query_key_value.weight"): + q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) + + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v) + ] + elif name.find("mlp.experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + tensors: list[tuple[str, Tensor]] = [] + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + + return tensors + + new_name = self.map_tensor_name(name) + + if new_name == output_name and self.hparams.get("norm_head"): + data_torch = 
data_torch.float() + data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 + + return [(new_name, data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(TextModel): model_arch = gguf.MODEL_ARCH.CHAMELEON def set_gguf_parameters(self): @@ -5716,9 +5645,7 @@ def set_gguf_parameters(self): def set_vocab(self): self._set_vocab_gpt2() - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # ignore image tokenizer for now # TODO: remove this once image support is implemented for Chameleon if name.startswith("model.vqmodel"): @@ -5733,13 +5660,9 @@ def modify_tensors( if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) if name.endswith(("q_norm.weight", "q_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute( - data_torch, n_head, hidden_dim - ) + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) if name.endswith(("k_norm.weight", "k_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute( - data_torch, n_kv_head, hidden_dim - ) + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) return [(self.map_tensor_name(name), data_torch)] @@ -5794,24 +5717,26 @@ def numpy(self) -> gguf.LazyNumpyTensor: return gguf.LazyNumpyTensor( meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), args=(self,), - func=(lambda s: s.numpy()), + func=(lambda s: s.numpy()) ) @classmethod 
- def meta_with_dtype_and_shape( - cls, dtype: torch.dtype, shape: tuple[int, ...] - ) -> Tensor: + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: return torch.empty(size=shape, dtype=dtype, device="meta") @classmethod def from_safetensors_slice(cls, st_slice: Any) -> Tensor: dtype = cls._dtype_str_map[st_slice.get_dtype()] shape: tuple[int, ...] = tuple(st_slice.get_shape()) - lazy = cls( - meta=cls.meta_with_dtype_and_shape(dtype, shape), - args=(st_slice,), - func=lambda s: s[:], - ) + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) + return cast(torch.Tensor, lazy) + + @classmethod + def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): + dtype = cls._dtype_str_map[remote_tensor.dtype] + shape = remote_tensor.shape + meta = cls.meta_with_dtype_and_shape(dtype, shape) + lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape)) return cast(torch.Tensor, lazy) @classmethod @@ -5829,88 +5754,75 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file" - ) + description="Convert a huggingface model to a GGML compatible file") parser.add_argument( - "--vocab-only", - action="store_true", + "--vocab-only", action="store_true", help="extract only the vocab", ) parser.add_argument( - "--outfile", - type=Path, + "--outfile", type=Path, help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", - type=str, - choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], - default="f16", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( - "--bigendian", - action="store_true", + "--bigendian", action="store_true", help="model is executed on big endian machine", ) parser.add_argument( - "model", - type=Path, + "model", type=Path, help="directory containing model file", nargs="?", ) parser.add_argument( - "--use-temp-file", - action="store_true", + "--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)", ) parser.add_argument( - "--no-lazy", - action="store_true", + "--no-lazy", action="store_true", help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", ) parser.add_argument( - "--model-name", - type=str, - default=None, + "--model-name", type=str, default=None, help="name of the model", ) parser.add_argument( - "--verbose", - action="store_true", + "--verbose", action="store_true", help="increase output verbosity", ) parser.add_argument( - "--split-max-tensors", - type=int, - default=0, + "--split-max-tensors", type=int, default=0, help="max tensors in each split", ) parser.add_argument( - "--split-max-size", - type=str, - default="0", + "--split-max-size", type=str, default="0", help="max size per split N(M|G)", ) parser.add_argument( - "--dry-run", - action="store_true", + "--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files", ) parser.add_argument( - "--no-tensor-first-split", - 
action="store_true", - help="do not add tensors to the first split (disabled by default)", + "--no-tensor-first-split", action="store_true", + help="do not add tensors to the first split (disabled by default)" + ) + parser.add_argument( + "--metadata", type=Path, + help="Specify the path for an authorship metadata override file" + ) + parser.add_argument( + "--print-supported-models", action="store_true", + help="Print the supported models" ) parser.add_argument( - "--metadata", - type=Path, - help="Specify the path for an authorship metadata override file", + "--remote", action="store_true", + help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", ) parser.add_argument( - "--print-supported-models", - action="store_true", - help="Print the supported models", + "--mmproj", action="store_true", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. 
A prefix 'mmproj-' will be added to the output file name.", ) args = parser.parse_args() @@ -5929,9 +5841,7 @@ def split_str_to_n_bytes(split_str: str) -> int: elif split_str.isnumeric(): n = int(split_str) else: - raise ValueError( - f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G" - ) + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") if n < 0: raise ValueError(f"Invalid split size: {split_str}, must be positive") @@ -5944,7 +5854,7 @@ def main() -> None: if args.print_supported_models: logger.error("Supported models:") - Model.print_registered_models() + ModelBase.print_registered_models() sys.exit(0) if args.verbose: @@ -5954,8 +5864,16 @@ def main() -> None: dir_model = args.model + if args.remote: + from huggingface_hub import snapshot_download + local_dir = snapshot_download( + repo_id=str(dir_model), + allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]) + dir_model = Path(local_dir) + logger.info(f"Downloaded config and tokenizer to {local_dir}") + if not dir_model.is_dir(): - logger.error(f"Error: {args.model} is not a directory") + logger.error(f'Error: {args.model} is not a directory') sys.exit(1) ftype_map: dict[str, gguf.LlamaFileType] = { @@ -5975,54 +5893,49 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile + elif args.remote: + # if remote, use the model ID as the output file name + fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf") else: fname_out = dir_model logger.info(f"Loading model: {dir_model.name}") - hparams = Model.load_hparams(dir_model) + hparams = ModelBase.load_hparams(dir_model) + + if args.mmproj: + if "mmproj" not in fname_out.name: + fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] - + model_type = ModelType.VISION if args.mmproj 
else ModelType.TEXT try: - model_class = Model.from_model_architecture(model_architecture) + model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - model_instance = model_class( - dir_model=dir_model, - ftype=output_type, - fname_out=fname_out, - is_big_endian=args.bigendian, - use_temp_file=args.use_temp_file, - eager=args.no_lazy, - metadata_override=args.metadata, - model_name=args.model_name, - split_max_tensors=args.split_max_tensors, - split_max_size=split_str_to_n_bytes(args.split_max_size), - dry_run=args.dry_run, - small_first_shard=args.no_tensor_first_split, - ) + model_instance = model_class(dir_model, output_type, fname_out, + is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, model_name=args.model_name, + split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split, + remote_hf_model_id=str(args.model) if args.remote else None) if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() - logger.info( - f"Model vocab successfully exported to {model_instance.fname_out}" - ) + logger.info(f"Model vocab successfully exported to {model_instance.fname_out}") else: logger.info("Exporting model...") model_instance.write() - out_path = ( - f"{model_instance.fname_out.parent}{os.sep}" - if is_split - else model_instance.fname_out - ) + out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out logger.info(f"Model successfully exported to {out_path}") -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/lpm_kernel/L2/dpo/dpo_data.py b/lpm_kernel/L2/dpo/dpo_data.py index 696ca734..47395596 100644 --- a/lpm_kernel/L2/dpo/dpo_data.py +++ 
b/lpm_kernel/L2/dpo/dpo_data.py @@ -77,6 +77,8 @@ def preprocess(sample, is_cot=False): if sample.get('assistant') is None and sample.get('enhanced_request') is not None: user_message = f"{USER_NAME}'s request is " + sample['user_request'] infer_prompt = CONTEXT_COT_PROMPT.format(user_name=USER_NAME) if is_cot else CONTEXT_PROMPT.format(user_name=USER_NAME) + if "qwen3" in self.model_name.lower(): + infer_prompt += " no_think" messages = [ {"role": "system", "content": infer_prompt}, {"role": "user", "content": user_message}, @@ -86,6 +88,8 @@ def preprocess(sample, is_cot=False): if sample.get('assistant') is None and sample.get('user_feedback') is not None: user_message = f"{USER_NAME}'s request is " + sample['user_request'] + "\n" + "The response of expert is " + sample['expert_response'] infer_prompt = JUDGE_COT_PROMPT.format(user_name=USER_NAME) if is_cot else JUDGE_PROMPT.format(user_name=USER_NAME) + if "qwen3" in self.model_name.lower(): + infer_prompt += " no_think" messages = [ {"role": "system", "content": infer_prompt}, {"role": "user", "content": user_message}, @@ -128,6 +132,8 @@ def preprocess(sample, is_cot=False): return [{"messages": messages}] else: infer_prompt = MEMORY_COT_PROMPT.format(user_name=USER_NAME) if is_cot else MEMORY_PROMPT.format(user_name=USER_NAME) + if "qwen3" in self.model_name.lower(): + infer_prompt += " no_think" messages = [ {"role": "system", "content": infer_prompt}, {"role": "user", "content": sample['user']}, diff --git a/lpm_kernel/L2/dpo/dpo_train.py b/lpm_kernel/L2/dpo/dpo_train.py index fb5d0bc1..fb06a0ce 100644 --- a/lpm_kernel/L2/dpo/dpo_train.py +++ b/lpm_kernel/L2/dpo/dpo_train.py @@ -34,8 +34,13 @@ def training_data_processor(args, SYS = "You are a helpful assistant.\n\n"): "rejected": [data_point["rejected"] for data_point in data] } tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left") + # Only add enable_thinking for qwen3 models + if "qwen3" in args.base_model_path.lower(): + 
prompt = tokenizer.apply_chat_template(training_data["prompt"], tokenize=False, enable_thinking=False) + else: + prompt = tokenizer.apply_chat_template(training_data["prompt"], tokenize=False) training_data = { - "prompt": tokenizer.apply_chat_template(training_data["prompt"], tokenize=False), + "prompt": prompt, "chosen": training_data["chosen"], "rejected": training_data["rejected"] } diff --git a/lpm_kernel/L2/gguf-py/README.md b/lpm_kernel/L2/gguf-py/README.md index 2e513633..ca7e09c6 100644 --- a/lpm_kernel/L2/gguf-py/README.md +++ b/lpm_kernel/L2/gguf-py/README.md @@ -1,9 +1,9 @@ ## gguf -This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) +This is a Python package for writing binary files in the [GGUF](https://github.com/ggml-org/ggml/pull/302) (GGML Universal File) format. -See [convert_hf_to_gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) +See [convert_hf_to_gguf.py](https://github.com/ggml-org/llama.cpp/blob/master/convert_hf_to_gguf.py) as an example for its usage. ## Installation @@ -11,19 +11,26 @@ as an example for its usage. pip install gguf ``` +Optionally, you can install gguf with the extra 'gui' to enable the visual GGUF editor. +```sh +pip install gguf[gui] +``` + ## API Examples/Simple Tools -[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. +[examples/writer.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. + +[examples/reader.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format. 
-[examples/reader.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format. +[gguf/scripts/gguf_dump.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console. -[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console. +[gguf/scripts/gguf_set_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key. -[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key. +[gguf/scripts/gguf_convert_endian.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files. -[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files. +[gguf/scripts/gguf_new_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values. -[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values. +[gguf/scripts/gguf_editor_gui.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_editor_gui.py) — Allows for viewing, editing, adding, or removing metadata values within a GGUF file as well as viewing its tensors with a Qt interface. 
## Development Maintainers who participate in development of this package are advised to install it in editable mode: diff --git a/lpm_kernel/L2/gguf-py/examples/reader.py b/lpm_kernel/L2/gguf-py/examples/reader.py index 575fdd34..703b782b 100644 --- a/lpm_kernel/L2/gguf-py/examples/reader.py +++ b/lpm_kernel/L2/gguf-py/examples/reader.py @@ -2,12 +2,14 @@ import logging import sys from pathlib import Path -from gguf.gguf_reader import GGUFReader logger = logging.getLogger("reader") +# Necessary to load the local gguf package sys.path.insert(0, str(Path(__file__).parent.parent)) +from gguf.gguf_reader import GGUFReader + def read_gguf_file(gguf_file_path): """ @@ -20,30 +22,26 @@ def read_gguf_file(gguf_file_path): reader = GGUFReader(gguf_file_path) # List all key-value pairs in a columnized format - print("Key-Value Pairs:") # noqa: NP100 + print("Key-Value Pairs:") # noqa: NP100 max_key_length = max(len(key) for key in reader.fields.keys()) for key, field in reader.fields.items(): value = field.parts[field.data[0]] - print(f"{key:{max_key_length}} : {value}") # noqa: NP100 - print("----") # noqa: NP100 + print(f"{key:{max_key_length}} : {value}") # noqa: NP100 + print("----") # noqa: NP100 # List all tensors - print("Tensors:") # noqa: NP100 + print("Tensors:") # noqa: NP100 tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}" - print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100 - print("-" * 80) # noqa: NP100 + print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100 + print("-" * 80) # noqa: NP100 for tensor in reader.tensors: shape_str = "x".join(map(str, tensor.shape)) size_str = str(tensor.n_elements) quantization_str = tensor.tensor_type.name - print( - tensor_info_format.format( - tensor.name, shape_str, size_str, quantization_str - ) - ) # noqa: NP100 + print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: 
NP100 -if __name__ == "__main__": +if __name__ == '__main__': if len(sys.argv) < 2: logger.info("Usage: reader.py ") sys.exit(1) diff --git a/lpm_kernel/L2/gguf-py/examples/writer.py b/lpm_kernel/L2/gguf-py/examples/writer.py index 6263fc0c..731873a7 100755 --- a/lpm_kernel/L2/gguf-py/examples/writer.py +++ b/lpm_kernel/L2/gguf-py/examples/writer.py @@ -35,5 +35,5 @@ def writer_example() -> None: gguf_writer.close() -if __name__ == "__main__": +if __name__ == '__main__': writer_example() diff --git a/lpm_kernel/L2/gguf-py/gguf/constants.py b/lpm_kernel/L2/gguf-py/gguf/constants.py index 6f7d8ff9..326ccdb0 100644 --- a/lpm_kernel/L2/gguf-py/gguf/constants.py +++ b/lpm_kernel/L2/gguf-py/gguf/constants.py @@ -7,10 +7,10 @@ # constants # -GGUF_MAGIC = 0x46554747 # "GGUF" -GGUF_VERSION = 3 +GGUF_MAGIC = 0x46554747 # "GGUF" +GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 -GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h +GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h # # metadata keys @@ -19,204 +19,227 @@ class Keys: class General: - TYPE = "general.type" - ARCHITECTURE = "general.architecture" - QUANTIZATION_VERSION = "general.quantization_version" - ALIGNMENT = "general.alignment" - FILE_TYPE = "general.file_type" + TYPE = "general.type" + ARCHITECTURE = "general.architecture" + QUANTIZATION_VERSION = "general.quantization_version" + ALIGNMENT = "general.alignment" + FILE_TYPE = "general.file_type" # Authorship Metadata - NAME = "general.name" - AUTHOR = "general.author" - VERSION = "general.version" - ORGANIZATION = "general.organization" + NAME = "general.name" + AUTHOR = "general.author" + VERSION = "general.version" + ORGANIZATION = "general.organization" - FINETUNE = "general.finetune" - BASENAME = "general.basename" + FINETUNE = "general.finetune" + BASENAME = "general.basename" - DESCRIPTION = "general.description" - QUANTIZED_BY = "general.quantized_by" + DESCRIPTION = "general.description" + QUANTIZED_BY = "general.quantized_by" - SIZE_LABEL = 
"general.size_label" + SIZE_LABEL = "general.size_label" # Licensing details - LICENSE = "general.license" - LICENSE_NAME = "general.license.name" - LICENSE_LINK = "general.license.link" + LICENSE = "general.license" + LICENSE_NAME = "general.license.name" + LICENSE_LINK = "general.license.link" # Typically represents the converted GGUF repo (Unless native) - URL = "general.url" # Model Website/Paper - DOI = "general.doi" - UUID = "general.uuid" - REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...) + URL = "general.url" # Model Website/Paper + DOI = "general.doi" + UUID = "general.uuid" + REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...) # Model Source during conversion - SOURCE_URL = "general.source.url" # Model Website/Paper - SOURCE_DOI = "general.source.doi" - SOURCE_UUID = "general.source.uuid" - SOURCE_REPO_URL = ( - "general.source.repo_url" # Model Source Repository (git/svn/etc...) - ) + SOURCE_URL = "general.source.url" # Model Website/Paper + SOURCE_DOI = "general.source.doi" + SOURCE_UUID = "general.source.uuid" + SOURCE_REPO_URL = "general.source.repo_url" # Model Source Repository (git/svn/etc...) # Base Model Source. There can be more than one source if it's a merged # model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in # tracing linage of models as it is finetuned or merged over time. - BASE_MODEL_COUNT = "general.base_model.count" - BASE_MODEL_NAME = "general.base_model.{id}.name" - BASE_MODEL_AUTHOR = "general.base_model.{id}.author" - BASE_MODEL_VERSION = "general.base_model.{id}.version" - BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" - BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description" - BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper - BASE_MODEL_DOI = "general.base_model.{id}.doi" - BASE_MODEL_UUID = "general.base_model.{id}.uuid" - BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...) 
+ BASE_MODEL_COUNT = "general.base_model.count" + BASE_MODEL_NAME = "general.base_model.{id}.name" + BASE_MODEL_AUTHOR = "general.base_model.{id}.author" + BASE_MODEL_VERSION = "general.base_model.{id}.version" + BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" + BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description" + BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper + BASE_MODEL_DOI = "general.base_model.{id}.doi" + BASE_MODEL_UUID = "general.base_model.{id}.uuid" + BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...) # Dataset Source - DATASET_COUNT = "general.dataset.count" - DATASET_NAME = "general.dataset.{id}.name" - DATASET_AUTHOR = "general.dataset.{id}.author" - DATASET_VERSION = "general.dataset.{id}.version" - DATASET_ORGANIZATION = "general.dataset.{id}.organization" - DATASET_DESCRIPTION = "general.dataset.{id}.description" - DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper - DATASET_DOI = "general.dataset.{id}.doi" - DATASET_UUID = "general.dataset.{id}.uuid" - DATASET_REPO_URL = ( - "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...) - ) + DATASET_COUNT = "general.dataset.count" + DATASET_NAME = "general.dataset.{id}.name" + DATASET_AUTHOR = "general.dataset.{id}.author" + DATASET_VERSION = "general.dataset.{id}.version" + DATASET_ORGANIZATION = "general.dataset.{id}.organization" + DATASET_DESCRIPTION = "general.dataset.{id}.description" + DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper + DATASET_DOI = "general.dataset.{id}.doi" + DATASET_UUID = "general.dataset.{id}.uuid" + DATASET_REPO_URL = "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...) 
# Array based KV stores - TAGS = "general.tags" - LANGUAGES = "general.languages" + TAGS = "general.tags" + LANGUAGES = "general.languages" class LLM: - VOCAB_SIZE = "{arch}.vocab_size" - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - FEATURES_LENGTH = "{arch}.features_length" - BLOCK_COUNT = "{arch}.block_count" - LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" - EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" + VOCAB_SIZE = "{arch}.vocab_size" + CONTEXT_LENGTH = "{arch}.context_length" + EMBEDDING_LENGTH = "{arch}.embedding_length" + FEATURES_LENGTH = "{arch}.features_length" + BLOCK_COUNT = "{arch}.block_count" + LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" + FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" + EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length" - USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" - EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" - EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" - EXPERT_GATING_FUNC = "{arch}.expert_gating_func" - POOLING_TYPE = "{arch}.pooling_type" - LOGIT_SCALE = "{arch}.logit_scale" - DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" - ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" - FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" - SWIN_NORM = "{arch}.swin_norm" - RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" - TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" - TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" - RESIDUAL_SCALE = "{arch}.residual_scale" - EMBEDDING_SCALE = "{arch}.embedding_scale" - TOKEN_SHIFT_COUNT = 
"{arch}.token_shift_count" + USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" + TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" + EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" + EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" + EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" + EXPERT_GATING_FUNC = "{arch}.expert_gating_func" + MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers" + POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" + DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" + FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + SWIN_NORM = "{arch}.swin_norm" + RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" + TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" + TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" + RESIDUAL_SCALE = "{arch}.residual_scale" + EMBEDDING_SCALE = "{arch}.embedding_scale" + TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" + INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step" class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" - LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" - GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" - CAUSAL = "{arch}.attention.causal" - Q_LORA_RANK = "{arch}.attention.q_lora_rank" - KV_LORA_RANK = "{arch}.attention.kv_lora_rank" - REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" - SLIDING_WINDOW = "{arch}.attention.sliding_window" - SCALE = "{arch}.attention.scale" + HEAD_COUNT = 
"{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH = "{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" + CAUSAL = "{arch}.attention.causal" + Q_LORA_RANK = "{arch}.attention.q_lora_rank" + KV_LORA_RANK = "{arch}.attention.kv_lora_rank" + DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank" + ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank" + VALUE_RESIDUAL_MIX_LORA_RANK = "{arch}.attention.value_residual_mix_lora_rank" + GATE_LORA_RANK = "{arch}.attention.gate_lora_rank" + REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" + SLIDING_WINDOW = "{arch}.attention.sliding_window" + SCALE = "{arch}.attention.scale" + KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" + VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = 
"{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" class Split: - LLM_KV_SPLIT_NO = "split.no" - LLM_KV_SPLIT_COUNT = "split.count" + LLM_KV_SPLIT_NO = "split.no" + LLM_KV_SPLIT_COUNT = "split.count" LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" class SSM: - CONV_KERNEL = "{arch}.ssm.conv_kernel" - INNER_SIZE = "{arch}.ssm.inner_size" - STATE_SIZE = "{arch}.ssm.state_size" + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" - DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" class WKV: HEAD_SIZE = "{arch}.wkv.head_size" class PosNet: EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" - BLOCK_COUNT = "{arch}.posnet.block_count" + BLOCK_COUNT = "{arch}.posnet.block_count" class ConvNext: EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" - BLOCK_COUNT = "{arch}.convnext.block_count" + BLOCK_COUNT = "{arch}.convnext.block_count" class Tokenizer: - MODEL = "tokenizer.ggml.model" - PRE = "tokenizer.ggml.pre" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = ( - "tokenizer.ggml.token_type_count" # for BERT-style token types - ) - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - MASK_ID = "tokenizer.ggml.mask_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" + MODEL = "tokenizer.ggml.model" + PRE = 
"tokenizer.ggml.pre" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" - CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" - CHAT_TEMPLATES = "tokenizer.chat_templates" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" + CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" + CHAT_TEMPLATES = "tokenizer.chat_templates" # FIM/Infill special tokens constants - FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" - FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" - FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" - FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" - FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" - FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" + FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" + FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" + FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" + FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" + FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" + FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" # deprecated: - PREFIX_ID = "tokenizer.ggml.prefix_token_id" - SUFFIX_ID = 
"tokenizer.ggml.suffix_token_id" - MIDDLE_ID = "tokenizer.ggml.middle_token_id" + PREFIX_ID = "tokenizer.ggml.prefix_token_id" + SUFFIX_ID = "tokenizer.ggml.suffix_token_id" + MIDDLE_ID = "tokenizer.ggml.middle_token_id" class Adapter: - TYPE = "adapter.type" + TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class ClipVision: + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + IMAGE_SIZE = "clip.vision.image_size" + PATCH_SIZE = "clip.vision.patch_size" + EMBEDDING_LENGTH = "clip.vision.embedding_length" + FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length" + PROJECTION_DIM = "clip.vision.projection_dim" + BLOCK_COUNT = "clip.vision.block_count" + IMAGE_MEAN = "clip.vision.image_mean" + IMAGE_STD = "clip.vision.image_std" + USE_GELU = "clip.use_gelu" + USE_SILU = "clip.use_silu" + + class Attention: + HEAD_COUNT = "clip.vision.attention.head_count" + LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + + class Projector: + SCALE_FACTOR = "clip.vision.projector.scale_factor" # # recommended mapping of model tensor names for storage in gguf @@ -224,385 +247,556 @@ class Adapter: class GGUFType: - MODEL = "model" - ADAPTER = "adapter" + MODEL = "model" + ADAPTER = "adapter" + CLIP_VISION = "clip-vision" class MODEL_ARCH(IntEnum): - LLAMA = auto() - DECI = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - REFACT = auto() - BERT = auto() - NOMIC_BERT = auto() - JINA_BERT_V2 = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - QWEN2MOE = auto() - QWEN2VL = auto() - PHI2 = auto() - PHI3 = auto() - PHIMOE = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - MINICPM3 = auto() - GEMMA = auto() - GEMMA2 = auto() - STARCODER2 = auto() - RWKV6 = auto() - RWKV6QWEN2 = auto() - MAMBA = 
auto() - XVERSE = auto() - COMMAND_R = auto() - COHERE2 = auto() - DBRX = auto() - OLMO = auto() - OLMO2 = auto() - OLMOE = auto() - OPENELM = auto() - ARCTIC = auto() - DEEPSEEK = auto() - DEEPSEEK2 = auto() - CHATGLM = auto() - BITNET = auto() - T5 = auto() - T5ENCODER = auto() - JAIS = auto() - NEMOTRON = auto() - EXAONE = auto() - GRANITE = auto() - GRANITE_MOE = auto() - CHAMELEON = auto() + CLIP_VISION = auto() # dummy arch for clip.cpp + LLAMA = auto() + LLAMA4 = auto() + DECI = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() + NOMIC_BERT_MOE = auto() + JINA_BERT_V2 = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + QWEN2MOE = auto() + QWEN2VL = auto() + QWEN3 = auto() + QWEN3MOE = auto() + PHI2 = auto() + PHI3 = auto() + PHIMOE = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + MINICPM3 = auto() + GEMMA = auto() + GEMMA2 = auto() + GEMMA3 = auto() + STARCODER2 = auto() + RWKV6 = auto() + RWKV6QWEN2 = auto() + RWKV7 = auto() + ARWKV7 = auto() + MAMBA = auto() + XVERSE = auto() + COMMAND_R = auto() + COHERE2 = auto() + DBRX = auto() + OLMO = auto() + OLMO2 = auto() + OLMOE = auto() + OPENELM = auto() + ARCTIC = auto() + DEEPSEEK = auto() + DEEPSEEK2 = auto() + CHATGLM = auto() + GLM4 = auto() + BITNET = auto() + T5 = auto() + T5ENCODER = auto() + JAIS = auto() + NEMOTRON = auto() + EXAONE = auto() + GRANITE = auto() + GRANITE_MOE = auto() + CHAMELEON = auto() WAVTOKENIZER_DEC = auto() + PLM = auto() + BAILINGMOE = auto() + + +class VISION_PROJECTOR_TYPE(IntEnum): + MLP = auto() + LDP = auto() + LDPV2 = auto() + RESAMPLER = auto() + GLM_EDGE = auto() + MERGER = auto() + GEMMA3 = auto() class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - 
OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ROPE_FACTORS_LONG = auto() - ROPE_FACTORS_SHORT = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_OUT_NORM = auto() - ATTN_POST_NORM = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE_INP = auto() - FFN_GATE_INP_SHEXP = auto() - FFN_NORM = auto() - FFN_PRE_NORM = auto() - FFN_POST_NORM = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_NORM_EXP = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - FFN_GATE_SHEXP = auto() - FFN_DOWN_SHEXP = auto() - FFN_UP_SHEXP = auto() - FFN_EXP_PROBS_B = auto() - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - LAYER_OUT_NORM = auto() - SSM_IN = auto() - SSM_CONV1D = auto() - SSM_X = auto() - SSM_DT = auto() - SSM_A = auto() - SSM_D = auto() - SSM_OUT = auto() - TIME_MIX_W1 = auto() - TIME_MIX_W2 = auto() - TIME_MIX_LERP_X = auto() - TIME_MIX_LERP_K = auto() - TIME_MIX_LERP_V = auto() - TIME_MIX_LERP_R = auto() - TIME_MIX_LERP_G = auto() - TIME_MIX_LERP_FUSED = auto() - TIME_MIX_LERP_W = auto() - TIME_MIX_FIRST = auto() - TIME_MIX_DECAY = auto() - TIME_MIX_DECAY_W1 = auto() - TIME_MIX_DECAY_W2 = auto() - TIME_MIX_KEY = auto() - TIME_MIX_VALUE = auto() - TIME_MIX_RECEPTANCE = auto() - TIME_MIX_GATE = auto() - TIME_MIX_LN = auto() - TIME_MIX_OUTPUT = auto() - CHANNEL_MIX_LERP_K = auto() - CHANNEL_MIX_LERP_R = auto() - CHANNEL_MIX_KEY = auto() + TOKEN_EMBD = auto() + TOKEN_EMBD_NORM = auto() + TOKEN_TYPES = auto() + POS_EMBD = auto() + OUTPUT = auto() + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ROPE_FACTORS_LONG = auto() + ROPE_FACTORS_SHORT = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_OUT_NORM = auto() + ATTN_POST_NORM = auto() + ATTN_ROT_EMBD = auto() + FFN_GATE_INP = auto() + FFN_GATE_INP_SHEXP = auto() + FFN_NORM = 
auto() + FFN_PRE_NORM = auto() + FFN_POST_NORM = auto() + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_ACT = auto() + FFN_NORM_EXP = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() + FFN_GATE_SHEXP = auto() + FFN_DOWN_SHEXP = auto() + FFN_UP_SHEXP = auto() + FFN_EXP_PROBS_B = auto() + ATTN_Q_NORM = auto() + ATTN_K_NORM = auto() + LAYER_OUT_NORM = auto() + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_A = auto() + SSM_D = auto() + SSM_OUT = auto() + TIME_MIX_W0 = auto() + TIME_MIX_W1 = auto() + TIME_MIX_W2 = auto() + TIME_MIX_A0 = auto() + TIME_MIX_A1 = auto() + TIME_MIX_A2 = auto() + TIME_MIX_V0 = auto() + TIME_MIX_V1 = auto() + TIME_MIX_V2 = auto() + TIME_MIX_G1 = auto() + TIME_MIX_G2 = auto() + TIME_MIX_K_K = auto() + TIME_MIX_K_A = auto() + TIME_MIX_R_K = auto() + TIME_MIX_LERP_X = auto() + TIME_MIX_LERP_K = auto() + TIME_MIX_LERP_V = auto() + TIME_MIX_LERP_R = auto() + TIME_MIX_LERP_G = auto() + TIME_MIX_LERP_FUSED = auto() + TIME_MIX_LERP_W = auto() + TIME_MIX_FIRST = auto() + TIME_MIX_DECAY = auto() + TIME_MIX_DECAY_W1 = auto() + TIME_MIX_DECAY_W2 = auto() + TIME_MIX_KEY = auto() + TIME_MIX_VALUE = auto() + TIME_MIX_RECEPTANCE = auto() + TIME_MIX_GATE = auto() + TIME_MIX_LN = auto() + TIME_MIX_OUTPUT = auto() + CHANNEL_MIX_LERP_K = auto() + CHANNEL_MIX_LERP_R = auto() + CHANNEL_MIX_KEY = auto() CHANNEL_MIX_RECEPTANCE = auto() - CHANNEL_MIX_VALUE = auto() - ATTN_Q_A = auto() - ATTN_Q_B = auto() - ATTN_KV_A_MQA = auto() - ATTN_KV_B = auto() - ATTN_Q_A_NORM = auto() - ATTN_KV_A_NORM = auto() - FFN_SUB_NORM = auto() - ATTN_SUB_NORM = auto() - DEC_ATTN_NORM = auto() - DEC_ATTN_Q = auto() - DEC_ATTN_K = auto() - DEC_ATTN_V = auto() - DEC_ATTN_OUT = auto() - DEC_ATTN_REL_B = auto() - DEC_CROSS_ATTN_NORM = auto() - DEC_CROSS_ATTN_Q = auto() - DEC_CROSS_ATTN_K = auto() - DEC_CROSS_ATTN_V = auto() - DEC_CROSS_ATTN_OUT = auto() + CHANNEL_MIX_VALUE = auto() + ATTN_Q_A = auto() + ATTN_Q_B = 
auto() + ATTN_KV_A_MQA = auto() + ATTN_KV_B = auto() + ATTN_K_B = auto() + ATTN_V_B = auto() + ATTN_Q_A_NORM = auto() + ATTN_KV_A_NORM = auto() + FFN_SUB_NORM = auto() + ATTN_SUB_NORM = auto() + DEC_ATTN_NORM = auto() + DEC_ATTN_Q = auto() + DEC_ATTN_K = auto() + DEC_ATTN_V = auto() + DEC_ATTN_OUT = auto() + DEC_ATTN_REL_B = auto() + DEC_CROSS_ATTN_NORM = auto() + DEC_CROSS_ATTN_Q = auto() + DEC_CROSS_ATTN_K = auto() + DEC_CROSS_ATTN_V = auto() + DEC_CROSS_ATTN_OUT = auto() DEC_CROSS_ATTN_REL_B = auto() - DEC_FFN_NORM = auto() - DEC_FFN_GATE = auto() - DEC_FFN_DOWN = auto() - DEC_FFN_UP = auto() - DEC_OUTPUT_NORM = auto() - ENC_ATTN_NORM = auto() - ENC_ATTN_Q = auto() - ENC_ATTN_K = auto() - ENC_ATTN_V = auto() - ENC_ATTN_OUT = auto() - ENC_ATTN_REL_B = auto() - ENC_FFN_NORM = auto() - ENC_FFN_GATE = auto() - ENC_FFN_DOWN = auto() - ENC_FFN_UP = auto() - ENC_OUTPUT_NORM = auto() - CLS = auto() # classifier - CLS_OUT = auto() # classifier output projection - CONV1D = auto() - CONVNEXT_DW = auto() - CONVNEXT_NORM = auto() - CONVNEXT_PW1 = auto() - CONVNEXT_PW2 = auto() - CONVNEXT_GAMMA = auto() - POSNET_CONV1 = auto() - POSNET_CONV2 = auto() - POSNET_NORM = auto() - POSNET_NORM1 = auto() - POSNET_NORM2 = auto() - POSNET_ATTN_NORM = auto() - POSNET_ATTN_Q = auto() - POSNET_ATTN_K = auto() - POSNET_ATTN_V = auto() - POSNET_ATTN_OUT = auto() + DEC_FFN_NORM = auto() + DEC_FFN_GATE = auto() + DEC_FFN_DOWN = auto() + DEC_FFN_UP = auto() + DEC_OUTPUT_NORM = auto() + ENC_ATTN_NORM = auto() + ENC_ATTN_Q = auto() + ENC_ATTN_K = auto() + ENC_ATTN_V = auto() + ENC_ATTN_OUT = auto() + ENC_ATTN_REL_B = auto() + ENC_FFN_NORM = auto() + ENC_FFN_GATE = auto() + ENC_FFN_DOWN = auto() + ENC_FFN_UP = auto() + ENC_OUTPUT_NORM = auto() + CLS = auto() # classifier + CLS_OUT = auto() # classifier output projection + CONV1D = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + CONVNEXT_PW2 = auto() + CONVNEXT_GAMMA = auto() + POSNET_CONV1 = auto() + POSNET_CONV2 
= auto() + POSNET_NORM = auto() + POSNET_NORM1 = auto() + POSNET_NORM2 = auto() + POSNET_ATTN_NORM = auto() + POSNET_ATTN_Q = auto() + POSNET_ATTN_K = auto() + POSNET_ATTN_V = auto() + POSNET_ATTN_OUT = auto() + # vision + V_MMPROJ = auto() + V_MMPROJ_FC = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_POS = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_V = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_OUTPUT = auto() + V_ENC_OUTPUT_NORM = auto() + V_ENC_FFN_UP = auto() + V_ENC_FFN_GATE = auto() + V_ENC_FFN_DOWN = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() + V_MM_INP_PROJ = auto() # gemma3 + V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv + V_TOK_EMBD_IMG_BREAK = auto() # pixtral MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.DECI: "deci", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.QWEN2MOE: "qwen2moe", - MODEL_ARCH.QWEN2VL: "qwen2vl", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PHI3: "phi3", - MODEL_ARCH.PHIMOE: "phimoe", - MODEL_ARCH.PLAMO: "plamo", - 
MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.MINICPM3: "minicpm3", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.GEMMA2: "gemma2", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.RWKV6: "rwkv6", - MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", - MODEL_ARCH.COHERE2: "cohere2", - MODEL_ARCH.DBRX: "dbrx", - MODEL_ARCH.OLMO: "olmo", - MODEL_ARCH.OLMO2: "olmo2", - MODEL_ARCH.OLMOE: "olmoe", - MODEL_ARCH.OPENELM: "openelm", - MODEL_ARCH.ARCTIC: "arctic", - MODEL_ARCH.DEEPSEEK: "deepseek", - MODEL_ARCH.DEEPSEEK2: "deepseek2", - MODEL_ARCH.CHATGLM: "chatglm", - MODEL_ARCH.BITNET: "bitnet", - MODEL_ARCH.T5: "t5", - MODEL_ARCH.T5ENCODER: "t5encoder", - MODEL_ARCH.JAIS: "jais", - MODEL_ARCH.NEMOTRON: "nemotron", - MODEL_ARCH.EXAONE: "exaone", - MODEL_ARCH.GRANITE: "granite", - MODEL_ARCH.GRANITE_MOE: "granitemoe", - MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.LLAMA4: "llama4", + MODEL_ARCH.DECI: "deci", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", + MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.QWEN2VL: "qwen2vl", + MODEL_ARCH.QWEN3: "qwen3", + MODEL_ARCH.QWEN3MOE: "qwen3moe", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", + MODEL_ARCH.PHIMOE: "phimoe", + MODEL_ARCH.PLAMO: "plamo", + 
MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.GEMMA3: "gemma3", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.RWKV6: "rwkv6", + MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", + MODEL_ARCH.RWKV7: "rwkv7", + MODEL_ARCH.ARWKV7: "arwkv7", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.COHERE2: "cohere2", + MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO2: "olmo2", + MODEL_ARCH.OLMOE: "olmoe", + MODEL_ARCH.OPENELM: "openelm", + MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK: "deepseek", + MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.GLM4: "glm4", + MODEL_ARCH.BITNET: "bitnet", + MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.JAIS: "jais", + MODEL_ARCH.NEMOTRON: "nemotron", + MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.CHAMELEON: "chameleon", MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", + MODEL_ARCH.PLM: "plm", + MODEL_ARCH.BAILINGMOE: "bailingmoe", +} + +VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { + VISION_PROJECTOR_TYPE.MLP: "mlp", + VISION_PROJECTOR_TYPE.LDP: "ldp", + VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", + VISION_PROJECTOR_TYPE.RESAMPLER: "resampler", + VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", + VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", + VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - 
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", - MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", - MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", - MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", - MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", - MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", - MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", - MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", - MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", - MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", - MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", - 
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", - MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", - MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", - MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", - MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", - MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r", - MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", - MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused", - MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", - MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", - MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", - MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1", - MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2", - MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key", - MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value", - MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance", - MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate", - MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln", - MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output", - MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k", - MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r", - MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key", - MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance", - MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value", - MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", - MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", - MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", - MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", - MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", - MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", - MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", - MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", - MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", - MODEL_TENSOR.DEC_ATTN_Q: 
"dec.blk.{bid}.attn_q", - MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", - MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", - MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", - MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", - MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", - MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", - MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", - MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", - MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", - MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", - MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", - MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", - MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", - MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", - MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", - MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", - MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", - MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", - MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", - MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", - MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", - MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", - MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", - MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", - MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", - MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", - MODEL_TENSOR.CLS: "cls", - MODEL_TENSOR.CLS_OUT: "cls.output", - MODEL_TENSOR.CONV1D: "conv1d", - MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", - MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", - MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", - MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", - MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", - MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", - MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", - MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", - 
MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", - MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", - MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", - MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", - MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", - MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", - MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", + MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", + MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", + MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", + 
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", + MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", + MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", + MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0", + MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1", + MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2", + MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0", + MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1", + MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2", + MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1", + MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2", + MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k", + MODEL_TENSOR.TIME_MIX_K_A: "blk.{bid}.time_mix_k_a", + MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k", + MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", + MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", + MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", + MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r", + MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", + MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused", + MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", + MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", + MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", + 
MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1", + MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2", + MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key", + MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value", + MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance", + MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate", + MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln", + MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output", + MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k", + MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r", + MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key", + MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance", + MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value", + MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", + MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", + MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", + MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", + MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", + MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", + MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", + MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", + MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", + MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", + MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", + MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", + MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", + MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", + MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", + MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", + MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", + MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", + MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", + 
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", + MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", + MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", + MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", + MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", + MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", + MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", + MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", + MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", + MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", + MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", + MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", + MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", + MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", + MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", + MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", + MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + MODEL_TENSOR.CLS: "cls", + MODEL_TENSOR.CLS_OUT: "cls.output", + MODEL_TENSOR.CONV1D: "conv1d", + MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", + MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", + MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", + MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", + MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", + MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", + MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", + MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", + MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", + MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", + MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", + MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", + MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", + MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", + MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + # vision + MODEL_TENSOR.V_MMPROJ: "mm.{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", + MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: 
"mm.model.peg.{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", + MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out", + MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2", + MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", + MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out", + MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", + MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", + MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", + MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.CLIP_VISION: [ + MODEL_TENSOR.V_MMPROJ, + MODEL_TENSOR.V_MMPROJ_FC, + MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_PEG, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + 
MODEL_TENSOR.V_ENC_FFN_GATE, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.V_MM_INP_PROJ, + MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_RESMPL_POS_EMBD_K, + MODEL_TENSOR.V_RESMPL_ATTN_Q, + MODEL_TENSOR.V_RESMPL_ATTN_K, + MODEL_TENSOR.V_RESMPL_ATTN_V, + MODEL_TENSOR.V_RESMPL_ATTN_OUT, + MODEL_TENSOR.V_RESMPL_KV, + MODEL_TENSOR.V_RESMPL_KV_NORM, + MODEL_TENSOR.V_RESMPL_POST_NORM, + MODEL_TENSOR.V_RESMPL_Q_NORM, + MODEL_TENSOR.V_RESMPL_PROJ, + MODEL_TENSOR.V_RESMPL_QUERY, + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, + ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -623,6 +817,29 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.LLAMA4: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], MODEL_ARCH.DECI: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -746,6 +963,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NOMIC_BERT_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.JINA_BERT_V2: [ MODEL_TENSOR.TOKEN_EMBD, 
MODEL_TENSOR.TOKEN_EMBD_NORM, @@ -896,6 +1129,40 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.QWEN3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.QWEN3MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], MODEL_ARCH.PLAMO: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1084,6 +1351,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_POST_NORM, ], + MODEL_ARCH.GEMMA3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1160,6 +1445,68 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.RWKV7: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + 
MODEL_TENSOR.TIME_MIX_W0, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_A0, + MODEL_TENSOR.TIME_MIX_A1, + MODEL_TENSOR.TIME_MIX_A2, + MODEL_TENSOR.TIME_MIX_V0, + MODEL_TENSOR.TIME_MIX_V1, + MODEL_TENSOR.TIME_MIX_V2, + MODEL_TENSOR.TIME_MIX_G1, + MODEL_TENSOR.TIME_MIX_G2, + MODEL_TENSOR.TIME_MIX_K_K, + MODEL_TENSOR.TIME_MIX_K_A, + MODEL_TENSOR.TIME_MIX_R_K, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.CHANNEL_MIX_LERP_K, + MODEL_TENSOR.CHANNEL_MIX_KEY, + MODEL_TENSOR.CHANNEL_MIX_VALUE, + ], + MODEL_ARCH.ARWKV7: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + MODEL_TENSOR.TIME_MIX_W0, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_A0, + MODEL_TENSOR.TIME_MIX_A1, + MODEL_TENSOR.TIME_MIX_A2, + MODEL_TENSOR.TIME_MIX_V0, + MODEL_TENSOR.TIME_MIX_V1, + MODEL_TENSOR.TIME_MIX_V2, + MODEL_TENSOR.TIME_MIX_G1, + MODEL_TENSOR.TIME_MIX_G2, + MODEL_TENSOR.TIME_MIX_K_K, + MODEL_TENSOR.TIME_MIX_K_A, + MODEL_TENSOR.TIME_MIX_R_K, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.MAMBA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1340,6 +1687,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_OUT, @@ -1357,18 +1706,52 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_EXP_PROBS_B, ], - MODEL_ARCH.CHATGLM: [ + MODEL_ARCH.PLM: [ + 
MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN, + ], + MODEL_ARCH.CHATGLM : [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GLM4 : [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.BITNET: [ MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -1541,6 +1924,25 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_V, MODEL_TENSOR.POSNET_ATTN_OUT, ], + MODEL_ARCH.BAILINGMOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], # TODO } @@ -1593,6 +1995,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.BAILINGMOE: [ + MODEL_TENSOR.ROPE_FREQS, + ], } # @@ -1601,64 +2006,64 @@ class MODEL_TENSOR(IntEnum): class TokenType(IntEnum): - NORMAL = 1 - 
UNKNOWN = 2 - CONTROL = 3 + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 + UNUSED = 5 + BYTE = 6 class RopeScalingType(Enum): - NONE = "none" - LINEAR = "linear" - YARN = "yarn" - LONGROPE = "longrope" + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' + LONGROPE = 'longrope' class PoolingType(IntEnum): NONE = 0 MEAN = 1 - CLS = 2 + CLS = 2 class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 IQ2_XXS = 16 - IQ2_XS = 17 + IQ2_XS = 17 IQ3_XXS = 18 - IQ1_S = 19 - IQ4_NL = 20 - IQ3_S = 21 - IQ2_S = 22 - IQ4_XS = 23 - I8 = 24 - I16 = 25 - I32 = 26 - I64 = 27 - F64 = 28 - IQ1_M = 29 - BF16 = 30 - TQ1_0 = 34 - TQ2_0 = 35 + IQ1_S = 19 + IQ4_NL = 20 + IQ3_S = 21 + IQ2_S = 22 + IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 + I64 = 27 + F64 = 28 + IQ1_M = 29 + BF16 = 30 + TQ1_0 = 34 + TQ2_0 = 35 class ExpertGatingFuncType(IntEnum): - SOFTMAX = 1 - SIGMOID = 2 + SOFTMAX = 1 + SIGMOID = 2 # TODO: add GGMLFileType from ggml_ftype in ggml.h @@ -1667,46 +2072,46 @@ class ExpertGatingFuncType(IntEnum): # from llama_ftype in llama.h # ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE. 
class LlamaFileType(IntEnum): - ALL_F32 = 0 - MOSTLY_F16 = 1 # except 1d tensors - MOSTLY_Q4_0 = 2 # except 1d tensors - MOSTLY_Q4_1 = 3 # except 1d tensors + ALL_F32 = 0 + MOSTLY_F16 = 1 # except 1d tensors + MOSTLY_Q4_0 = 2 # except 1d tensors + MOSTLY_Q4_1 = 3 # except 1d tensors # MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 # MOSTLY_Q4_2 = 5 # support has been removed # MOSTLY_Q4_3 = 6 # support has been removed - MOSTLY_Q8_0 = 7 # except 1d tensors - MOSTLY_Q5_0 = 8 # except 1d tensors - MOSTLY_Q5_1 = 9 # except 1d tensors - MOSTLY_Q2_K = 10 # except 1d tensors - MOSTLY_Q3_K_S = 11 # except 1d tensors - MOSTLY_Q3_K_M = 12 # except 1d tensors - MOSTLY_Q3_K_L = 13 # except 1d tensors - MOSTLY_Q4_K_S = 14 # except 1d tensors - MOSTLY_Q4_K_M = 15 # except 1d tensors - MOSTLY_Q5_K_S = 16 # except 1d tensors - MOSTLY_Q5_K_M = 17 # except 1d tensors - MOSTLY_Q6_K = 18 # except 1d tensors - MOSTLY_IQ2_XXS = 19 # except 1d tensors - MOSTLY_IQ2_XS = 20 # except 1d tensors - MOSTLY_Q2_K_S = 21 # except 1d tensors - MOSTLY_IQ3_XS = 22 # except 1d tensors - MOSTLY_IQ3_XXS = 23 # except 1d tensors - MOSTLY_IQ1_S = 24 # except 1d tensors - MOSTLY_IQ4_NL = 25 # except 1d tensors - MOSTLY_IQ3_S = 26 # except 1d tensors - MOSTLY_IQ3_M = 27 # except 1d tensors - MOSTLY_IQ2_S = 28 # except 1d tensors - MOSTLY_IQ2_M = 29 # except 1d tensors - MOSTLY_IQ4_XS = 30 # except 1d tensors - MOSTLY_IQ1_M = 31 # except 1d tensors - MOSTLY_BF16 = 32 # except 1d tensors + MOSTLY_Q8_0 = 7 # except 1d tensors + MOSTLY_Q5_0 = 8 # except 1d tensors + MOSTLY_Q5_1 = 9 # except 1d tensors + MOSTLY_Q2_K = 10 # except 1d tensors + MOSTLY_Q3_K_S = 11 # except 1d tensors + MOSTLY_Q3_K_M = 12 # except 1d tensors + MOSTLY_Q3_K_L = 13 # except 1d tensors + MOSTLY_Q4_K_S = 14 # except 1d tensors + MOSTLY_Q4_K_M = 15 # except 1d tensors + MOSTLY_Q5_K_S = 16 # except 1d tensors + MOSTLY_Q5_K_M = 17 # except 1d tensors + MOSTLY_Q6_K = 18 # except 1d tensors + MOSTLY_IQ2_XXS = 19 
# except 1d tensors + MOSTLY_IQ2_XS = 20 # except 1d tensors + MOSTLY_Q2_K_S = 21 # except 1d tensors + MOSTLY_IQ3_XS = 22 # except 1d tensors + MOSTLY_IQ3_XXS = 23 # except 1d tensors + MOSTLY_IQ1_S = 24 # except 1d tensors + MOSTLY_IQ4_NL = 25 # except 1d tensors + MOSTLY_IQ3_S = 26 # except 1d tensors + MOSTLY_IQ3_M = 27 # except 1d tensors + MOSTLY_IQ2_S = 28 # except 1d tensors + MOSTLY_IQ2_M = 29 # except 1d tensors + MOSTLY_IQ4_XS = 30 # except 1d tensors + MOSTLY_IQ1_M = 31 # except 1d tensors + MOSTLY_BF16 = 32 # except 1d tensors # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack - MOSTLY_TQ1_0 = 36 # except 1d tensors - MOSTLY_TQ2_0 = 37 # except 1d tensors + MOSTLY_TQ1_0 = 36 # except 1d tensors + MOSTLY_TQ2_0 = 37 # except 1d tensors - GUESSED = 1024 # not specified in the model file + GUESSED = 1024 # not specified in the model file class GGUFEndian(IntEnum): @@ -1715,18 +2120,18 @@ class GGUFEndian(IntEnum): class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 - UINT64 = 10 - INT64 = 11 + BOOL = 7 + STRING = 8 + ARRAY = 9 + UINT64 = 10 + INT64 = 11 FLOAT64 = 12 @staticmethod @@ -1746,106 +2151,112 @@ def get_type(val: Any) -> GGUFValueType: raise ValueError(f"Unknown type: {type(val)}") +class VisionProjectorType: + GEMMA3 = "gemma3" + IDEFICS3 = "idefics3" + PIXTRAL = "pixtral" + + # Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), 
- GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), - GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), + GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8), - GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), - GGMLQuantizationType.IQ4_NL: (32, 2 + 16), - GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), - GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), - GGMLQuantizationType.I8: (1, 1), - GGMLQuantizationType.I16: (1, 2), - GGMLQuantizationType.I32: (1, 4), - 
GGMLQuantizationType.I64: (1, 8), - GGMLQuantizationType.F64: (1, 8), - GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), - GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), - GGMLQuantizationType.TQ2_0: (256, 2 + 64), + GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), + GGMLQuantizationType.IQ4_NL: (32, 2 + 16), + GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), + GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), + GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), + GGMLQuantizationType.BF16: (1, 2), + GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), + GGMLQuantizationType.TQ2_0: (256, 2 + 64), } # Aliases for backward compatibility. 
# general -KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE +KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION -KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT -KEY_GENERAL_NAME = Keys.General.NAME -KEY_GENERAL_AUTHOR = Keys.General.AUTHOR -KEY_GENERAL_URL = Keys.General.URL -KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION -KEY_GENERAL_LICENSE = Keys.General.LICENSE -KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL -KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE +KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT +KEY_GENERAL_NAME = Keys.General.NAME +KEY_GENERAL_AUTHOR = Keys.General.AUTHOR +KEY_GENERAL_URL = Keys.General.URL +KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION +KEY_GENERAL_LICENSE = Keys.General.LICENSE +KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL +KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE # LLM -KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE -KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH -KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH -KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT -KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE +KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH +KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH +KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT +KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL -KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT +KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT # attention -KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT -KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV -KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS -KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV -KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS +KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT +KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV +KEY_ATTENTION_MAX_ALIBI_BIAS = 
Keys.Attention.MAX_ALIBI_BIAS +KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV +KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS # RoPE -KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR +KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT +KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE +KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE +KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED # SSM -KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL -KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE -KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE +KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL +KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE +KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK -KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS # tokenization -KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL -KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE -KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST +KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL +KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE +KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE -KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES -KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES -KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID -KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID -KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID -KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID -KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID -KEY_TOKENIZER_HF_JSON = 
Keys.Tokenizer.HF_JSON -KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV +KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES +KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES +KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID +KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID +KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID +KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID +KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID +KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON +KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID @@ -1855,6 +2266,6 @@ def get_type(val: Any) -> GGUFValueType: KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID # deprecated -KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID -KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID -KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID +KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID +KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID +KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID diff --git a/lpm_kernel/L2/gguf-py/gguf/gguf_reader.py b/lpm_kernel/L2/gguf-py/gguf/gguf_reader.py index 962c43e9..5991cdb7 100644 --- a/lpm_kernel/L2/gguf-py/gguf/gguf_reader.py +++ b/lpm_kernel/L2/gguf-py/gguf/gguf_reader.py @@ -6,6 +6,7 @@ import logging import os +import sys from collections import OrderedDict from typing import Any, Literal, NamedTuple, TypeVar, Union @@ -15,7 +16,6 @@ from .quants import quant_shape_to_byte_shape if __name__ == "__main__": - import sys from pathlib import Path # Allow running file in package as a script. 
@@ -28,6 +28,7 @@ GGUF_VERSION, GGMLQuantizationType, GGUFValueType, + GGUFEndian, ) logger = logging.getLogger(__name__) @@ -53,6 +54,48 @@ class ReaderField(NamedTuple): types: list[GGUFValueType] = [] + def contents(self, index_or_slice: int | slice = slice(None)) -> Any: + if self.types: + to_string = lambda x: str(x.tobytes(), encoding='utf-8') # noqa: E731 + main_type = self.types[0] + + if main_type == GGUFValueType.ARRAY: + sub_type = self.types[-1] + + if sub_type == GGUFValueType.STRING: + indices = self.data[index_or_slice] + + if isinstance(index_or_slice, int): + return to_string(self.parts[indices]) # type: ignore + else: + return [to_string(self.parts[idx]) for idx in indices] # type: ignore + else: + # FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too + + # Check if it's unsafe to perform slice optimization on data + # if any(True for idx in self.data if len(self.parts[idx]) != 1): + # optim_slice = slice(None) + # else: + # optim_slice = index_or_slice + # index_or_slice = slice(None) + + # if isinstance(optim_slice, int): + # return self.parts[self.data[optim_slice]].tolist()[0] + # else: + # return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice] + + if isinstance(index_or_slice, int): + return self.parts[self.data[index_or_slice]].tolist()[0] + else: + return [pv for idx in self.data[index_or_slice] for pv in self.parts[idx].tolist()] + + if main_type == GGUFValueType.STRING: + return to_string(self.parts[-1]) + else: + return self.parts[-1].tolist()[0] + + return None + class ReaderTensor(NamedTuple): name: str @@ -67,34 +110,32 @@ class ReaderTensor(NamedTuple): class GGUFReader: # I - same as host, S - swapped - byte_order: Literal["I", "S"] = "I" + byte_order: Literal['I', 'S'] = 'I' alignment: int = GGUF_DEFAULT_ALIGNMENT data_offset: int # Note: Internal helper, API may change. 
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = { - GGUFValueType.UINT8: np.uint8, - GGUFValueType.INT8: np.int8, - GGUFValueType.UINT16: np.uint16, - GGUFValueType.INT16: np.int16, - GGUFValueType.UINT32: np.uint32, - GGUFValueType.INT32: np.int32, + GGUFValueType.UINT8: np.uint8, + GGUFValueType.INT8: np.int8, + GGUFValueType.UINT16: np.uint16, + GGUFValueType.INT16: np.int16, + GGUFValueType.UINT32: np.uint32, + GGUFValueType.INT32: np.int32, GGUFValueType.FLOAT32: np.float32, - GGUFValueType.UINT64: np.uint64, - GGUFValueType.INT64: np.int64, + GGUFValueType.UINT64: np.uint64, + GGUFValueType.INT64: np.int64, GGUFValueType.FLOAT64: np.float64, - GGUFValueType.BOOL: np.bool_, + GGUFValueType.BOOL: np.bool_, } - def __init__( - self, path: os.PathLike[str] | str, mode: Literal["r", "r+", "c"] = "r" - ): - self.data = np.memmap(path, mode=mode) + def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'): + self.data = np.memmap(path, mode = mode) offs = 0 # Check for GGUF magic - if self._get(offs, np.uint32, override_order="<")[0] != GGUF_MAGIC: - raise ValueError("GGUF magic invalid") + if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC: + raise ValueError('GGUF magic invalid') offs += 4 # Check GGUF version @@ -102,46 +143,37 @@ def __init__( if temp_version[0] & 65535 == 0: # If we get 0 here that means it's (probably) a GGUF file created for # the opposite byte order of the machine this script is running on. 
- self.byte_order = "S" - temp_version = temp_version.newbyteorder(self.byte_order) + self.byte_order = 'S' + temp_version = temp_version.view(temp_version.dtype.newbyteorder(self.byte_order)) version = temp_version[0] if version not in READER_SUPPORTED_VERSIONS: - raise ValueError( - f"Sorry, file appears to be version {version} which we cannot handle" - ) + raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle') + if sys.byteorder == "little": + # Host is little endian + host_endian = GGUFEndian.LITTLE + swapped_endian = GGUFEndian.BIG + else: + # Sorry PDP or other weird systems that don't use BE or LE. + host_endian = GGUFEndian.BIG + swapped_endian = GGUFEndian.LITTLE + self.endianess = swapped_endian if self.byte_order == "S" else host_endian self.fields: OrderedDict[str, ReaderField] = OrderedDict() self.tensors: list[ReaderTensor] = [] - offs += self._push_field( - ReaderField( - offs, "GGUF.version", [temp_version], [0], [GGUFValueType.UINT32] - ) - ) + offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32])) # Check tensor count and kv count temp_counts = self._get(offs, np.uint64, 2) - offs += self._push_field( - ReaderField( - offs, - "GGUF.tensor_count", - [temp_counts[:1]], - [0], - [GGUFValueType.UINT64], - ) - ) - offs += self._push_field( - ReaderField( - offs, "GGUF.kv_count", [temp_counts[1:]], [0], [GGUFValueType.UINT64] - ) - ) + offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64])) + offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64])) tensor_count, kv_count = temp_counts offs = self._build_fields(offs, kv_count) # Build Tensor Info Fields offs, tensors_fields = self._build_tensor_info(offs, tensor_count) - new_align = self.fields.get("general.alignment") + new_align = self.fields.get('general.alignment') if new_align is not None: if new_align.types != 
[GGUFValueType.UINT32]: - raise ValueError("Bad type for general.alignment field") + raise ValueError('Bad type for general.alignment field') self.alignment = new_align.parts[-1][0] padding = offs % self.alignment if padding != 0: @@ -149,7 +181,7 @@ def __init__( self.data_offset = offs self._build_tensors(offs, tensors_fields) - _DT = TypeVar("_DT", bound=npt.DTypeLike) + _DT = TypeVar('_DT', bound = npt.DTypeLike) # Fetch a key/value metadata field by key. def get_field(self, key: str) -> Union[ReaderField, None]: @@ -160,41 +192,31 @@ def get_tensor(self, idx: int) -> ReaderTensor: return self.tensors[idx] def _get( - self, - offset: int, - dtype: npt.DTypeLike, - count: int = 1, - override_order: None | Literal["I", "S", "<"] = None, + self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None, ) -> npt.NDArray[Any]: count = int(count) - itemsize = int(np.empty([], dtype=dtype).itemsize) + itemsize = int(np.empty([], dtype = dtype).itemsize) end_offs = offset + itemsize * count arr = self.data[offset:end_offs].view(dtype=dtype)[:count] - if override_order is None: - return arr - return arr.view(arr.dtype.newbyteorder(override_order)) + return arr.view(arr.dtype.newbyteorder(self.byte_order if override_order is None else override_order)) def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: if field.name in self.fields: # TODO: add option to generate error on duplicate keys # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') - logger.warning(f"Duplicate key {field.name} at offset {field.offset}") - self.fields[field.name + "_{}".format(field.offset)] = field + logger.warning(f'Duplicate key {field.name} at offset {field.offset}') + self.fields[field.name + '_{}'.format(field.offset)] = field else: self.fields[field.name] = field return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts) - def _get_str( - self, offset: int - ) -> 
tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]: + def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]: slen = self._get(offset, np.uint64) return slen, self._get(offset + 8, np.uint8, slen[0]) def _get_field_parts( - self, - orig_offs: int, - raw_type: int, + self, orig_offs: int, raw_type: int, ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]: offs = orig_offs types: list[GGUFValueType] = [] @@ -218,10 +240,9 @@ def _get_field_parts( offs += int(alen.nbytes) aparts: list[npt.NDArray[Any]] = [raw_itype, alen] data_idxs: list[int] = [] + # FIXME: Handle multi-dimensional arrays properly instead of flattening for idx in range(alen[0]): - curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts( - offs, raw_itype[0] - ) + curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0]) if idx == 0: types += curr_types idxs_offs = len(aparts) @@ -230,7 +251,7 @@ def _get_field_parts( offs += curr_size return offs - orig_offs, aparts, data_idxs, types # We can't deal with this one. 
- raise ValueError("Unknown/unhandled field type {gtype}") + raise ValueError('Unknown/unhandled field type {gtype}') def _get_tensor_info_field(self, orig_offs: int) -> ReaderField: offs = orig_offs @@ -257,7 +278,7 @@ def _get_tensor_info_field(self, orig_offs: int) -> ReaderField: return ReaderField( orig_offs, - str(bytes(name_data), encoding="utf-8"), + str(bytes(name_data), encoding = 'utf-8'), [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor], [1, 3, 4, 5], ) @@ -271,26 +292,19 @@ def _build_fields(self, offs: int, count: int) -> int: offs += int(raw_kv_type.nbytes) parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type] idxs_offs = len(parts) - field_size, field_parts, field_idxs, field_types = self._get_field_parts( - offs, raw_kv_type[0] - ) + field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0]) parts += field_parts - self._push_field( - ReaderField( - orig_offs, - str(bytes(kv_kdata), encoding="utf-8"), - parts, - [idx + idxs_offs for idx in field_idxs], - field_types, - ), - skip_sum=True, - ) + self._push_field(ReaderField( + orig_offs, + str(bytes(kv_kdata), encoding = 'utf-8'), + parts, + [idx + idxs_offs for idx in field_idxs], + field_types, + ), skip_sum = True) offs += field_size return offs - def _build_tensor_info( - self, offs: int, count: int - ) -> tuple[int, list[ReaderField]]: + def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]: tensor_fields = [] for _ in range(count): field = self._get_tensor_info_field(offs) @@ -300,13 +314,13 @@ def _build_tensor_info( def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: tensors = [] - tensor_names = set() # keep track of name to prevent duplicated tensors + tensor_names = set() # keep track of name to prevent duplicated tensors for field in fields: _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts # check if there's any tensor having same name already in 
the list - tensor_name = str(bytes(name_data), encoding="utf-8") + tensor_name = str(bytes(name_data), encoding = 'utf-8') if tensor_name in tensor_names: - raise ValueError(f"Found duplicated tensor with name {tensor_name}") + raise ValueError(f'Found duplicated tensor with name {tensor_name}') tensor_names.add(tensor_name) ggml_type = GGMLQuantizationType(raw_dtype[0]) n_elems = int(np.prod(dims)) @@ -340,16 +354,14 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: item_count = n_bytes item_type = np.uint8 np_dims = quant_shape_to_byte_shape(np_dims, ggml_type) - tensors.append( - ReaderTensor( - name=tensor_name, - tensor_type=ggml_type, - shape=dims, - n_elements=n_elems, - n_bytes=n_bytes, - data_offset=data_offs, - data=self._get(data_offs, item_type, item_count).reshape(np_dims), - field=field, - ) - ) + tensors.append(ReaderTensor( + name = tensor_name, + tensor_type = ggml_type, + shape = dims, + n_elements = n_elems, + n_bytes = n_bytes, + data_offset = data_offs, + data = self._get(data_offs, item_type, item_count).reshape(np_dims), + field = field, + )) self.tensors = tensors diff --git a/lpm_kernel/L2/gguf-py/gguf/gguf_writer.py b/lpm_kernel/L2/gguf-py/gguf/gguf_writer.py index c8c3d200..f22a6d4a 100644 --- a/lpm_kernel/L2/gguf-py/gguf/gguf_writer.py +++ b/lpm_kernel/L2/gguf-py/gguf/gguf_writer.py @@ -53,8 +53,8 @@ class GGUFValue: class WriterState(Enum): NO_FILE = auto() - EMPTY = auto() - HEADER = auto() + EMPTY = auto() + HEADER = auto() KV_DATA = auto() TI_DATA = auto() WEIGHTS = auto() @@ -68,29 +68,22 @@ class GGUFWriter: kv_data: list[dict[str, GGUFValue]] state: WriterState _simple_value_packing = { - GGUFValueType.UINT8: "B", - GGUFValueType.INT8: "b", - GGUFValueType.UINT16: "H", - GGUFValueType.INT16: "h", - GGUFValueType.UINT32: "I", - GGUFValueType.INT32: "i", + GGUFValueType.UINT8: "B", + GGUFValueType.INT8: "b", + GGUFValueType.UINT16: "H", + GGUFValueType.INT16: "h", + GGUFValueType.UINT32: "I", + 
GGUFValueType.INT32: "i", GGUFValueType.FLOAT32: "f", - GGUFValueType.UINT64: "Q", - GGUFValueType.INT64: "q", + GGUFValueType.UINT64: "Q", + GGUFValueType.INT64: "q", GGUFValueType.FLOAT64: "d", - GGUFValueType.BOOL: "?", + GGUFValueType.BOOL: "?", } def __init__( - self, - path: os.PathLike[str] | str | None, - arch: str, - use_temp_file: bool = False, - endianess: GGUFEndian = GGUFEndian.LITTLE, - split_max_tensors: int = 0, - split_max_size: int = 0, - dry_run: bool = False, - small_first_shard: bool = False, + self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False ): self.fout = None self.path = Path(path) if path else None @@ -105,11 +98,9 @@ def __init__( self.split_max_size = split_max_size self.dry_run = dry_run self.small_first_shard = small_first_shard - logger.info( - "gguf: This GGUF file is for {0} Endian only".format( - "Big" if self.endianess == GGUFEndian.BIG else "Little", - ) - ) + logger.info("gguf: This GGUF file is for {0} Endian only".format( + "Big" if self.endianess == GGUFEndian.BIG else "Little", + )) self.state = WriterState.NO_FILE if self.small_first_shard: @@ -129,6 +120,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]: for tensors in self.tensors: for name, info in tensors.items(): + shape = info.shape if name.endswith(".lora_a"): @@ -137,9 +129,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]: elif name.endswith(".lora_b"): if last_lora_a is None or last_lora_a[0] != name[:-1] + "a": # Bail when the LoRA pair can't be found trivially - logger.warning( - "can't measure LoRA size correctly, tensor order is unusual" - ) + logger.warning("can't measure LoRA size correctly, tensor order is unusual") return 0, 0, 0, 0 else: shape = (*shape[:-1], last_lora_a[1].shape[-1]) @@ -147,7 +137,7 @@ def 
get_total_parameter_count(self) -> tuple[int, int, int, int]: size = prod(shape) if "_exps." in name: - expert_params += size // shape[-3] + expert_params += (size // shape[-3]) expert_sum += shape[-3] n_expert_tensors += 1 else: @@ -168,26 +158,15 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]: def format_shard_names(self, path: Path) -> list[Path]: if len(self.tensors) == 1: return [path] - return [ - path.with_name( - SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors)) - ) - for i in range(len(self.tensors)) - ] + return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))] def open_output_file(self, path: Path | None = None) -> None: - if ( - self.state is WriterState.EMPTY - and self.fout is not None - and (path is None or path == self.path) - ): + if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path): # allow calling this multiple times as long as the path is the same return if self.state is not WriterState.NO_FILE: - raise ValueError( - f"Expected output file to be not yet opened, got {self.state}" - ) + raise ValueError(f'Expected output file to be not yet opened, got {self.state}') if path is not None: self.path = path @@ -203,9 +182,7 @@ def print_plan(self) -> list[Path]: filenames = self.format_shard_names(self.path) assert len(filenames) == len(self.tensors) for name, tensors in zip(filenames, self.tensors): - logger.info( - f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}" - ) + logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}") if self.dry_run: logger.info("Dry run, not writing files") @@ -225,23 +202,17 @@ def add_shard_kv_data(self) -> None: self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits)) for i, kv_data in 
enumerate(self.kv_data): kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16) - kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue( - total_splits, GGUFValueType.UINT16 - ) - kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue( - total_tensors, GGUFValueType.INT32 - ) + kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16) + kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32) def write_header_to_file(self, path: Path | None = None) -> None: - if len(self.tensors) == 1 and ( - self.split_max_tensors != 0 or self.split_max_size != 0 - ): + if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0): logger.warning("Model fails split requirements, not splitting") self.open_output_file(path) if self.state is not WriterState.EMPTY: - raise ValueError(f"Expected output file to be empty, got {self.state}") + raise ValueError(f'Expected output file to be empty, got {self.state}') assert self.fout is not None assert len(self.fout) == len(self.tensors) @@ -250,7 +221,7 @@ def write_header_to_file(self, path: Path | None = None) -> None: self.add_shard_kv_data() for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data): - fout.write(self._pack(" None: def write_kv_data_to_file(self) -> None: if self.state is not WriterState.HEADER: - raise ValueError( - f"Expected output file to contain the header, got {self.state}" - ) + raise ValueError(f'Expected output file to contain the header, got {self.state}') assert self.fout is not None for fout, kv_data in zip(self.fout, self.kv_data): @@ -278,9 +247,7 @@ def write_kv_data_to_file(self) -> None: def write_ti_data_to_file(self) -> None: if self.state is not WriterState.KV_DATA: - raise ValueError( - f"Expected output file to contain KV data, got {self.state}" - ) + raise ValueError(f'Expected output file to contain KV data, got {self.state}') assert self.fout is not None for fout, tensors 
in zip(self.fout, self.tensors): @@ -303,12 +270,12 @@ def write_ti_data_to_file(self) -> None: def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: if any(key in kv_data for kv_data in self.kv_data): - raise ValueError(f"Duplicated key name {key!r}") + raise ValueError(f'Duplicated key name {key!r}') self.kv_data[0][key] = GGUFValue(value=val, type=vtype) def add_uint8(self, key: str, val: int) -> None: - self.add_key_value(key, val, GGUFValueType.UINT8) + self.add_key_value(key,val, GGUFValueType.UINT8) def add_int8(self, key: str, val: int) -> None: self.add_key_value(key, val, GGUFValueType.INT8) @@ -355,20 +322,14 @@ def ggml_pad(x: int, n: int) -> int: return ((x + n - 1) // n) * n def add_tensor_info( - self, - name: str, - tensor_shape: Sequence[int], - tensor_dtype: np.dtype, - tensor_nbytes: int, - raw_dtype: GGMLQuantizationType | None = None, + self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype, + tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None, ) -> None: if self.state is not WriterState.NO_FILE: - raise ValueError( - f"Expected output file to be not yet opened, got {self.state}" - ) + raise ValueError(f'Expected output file to be not yet opened, got {self.state}') if any(name in tensors for tensors in self.tensors): - raise ValueError(f"Duplicated tensor name {name!r}") + raise ValueError(f'Duplicated tensor name {name!r}') if raw_dtype is None: if tensor_dtype == np.float16: @@ -386,9 +347,7 @@ def add_tensor_info( elif tensor_dtype == np.int64: dtype = GGMLQuantizationType.I64 else: - raise ValueError( - "Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now" - ) + raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now") else: dtype = raw_dtype if tensor_dtype == np.uint8: @@ -399,22 +358,16 @@ def add_tensor_info( if ( # split when over tensor limit self.split_max_tensors != 0 and len(self.tensors[-1]) >= self.split_max_tensors - ) or ( # split 
when over size limit + ) or ( # split when over size limit self.split_max_size != 0 - and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes - > self.split_max_size + and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size ): self.tensors.append({}) - self.tensors[-1][name] = TensorInfo( - shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes - ) + self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes) def add_tensor( - self, - name: str, - tensor: np.ndarray[Any, Any], - raw_shape: Sequence[int] | None = None, + self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None, ) -> None: if self.endianess == GGUFEndian.BIG: @@ -425,9 +378,7 @@ def add_tensor( self.temp_file = fp shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape - self.add_tensor_info( - name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype - ) + self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype) if self.temp_file is None: self.tensors[-1][name].tensor = tensor @@ -437,21 +388,13 @@ def add_tensor( self.write_padding(self.temp_file, tensor.nbytes) def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None: - pad = ( - GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - - n - ) + pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n if pad != 0: fp.write(bytes([0] * pad)) def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: - if ( - self.state is not WriterState.TI_DATA - and self.state is not WriterState.WEIGHTS - ): - raise ValueError( - f"Expected output file to contain tensor info or weights, got {self.state}" - ) + if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS: + raise ValueError(f'Expected output file to contain tensor info or weights, got 
{self.state}') assert self.fout is not None if self.endianess == GGUFEndian.BIG: @@ -467,9 +410,7 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: # pop the first tensor info # TODO: cleaner way to get the first key - first_tensor_name = [ - name for name, _ in zip(self.tensors[file_id].keys(), range(1)) - ][0] + first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0] ti = self.tensors[file_id].pop(first_tensor_name) assert ti.nbytes == tensor.nbytes @@ -497,15 +438,8 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values()) if len(self.fout) > 1: - shard_bar = tqdm( - desc=f"Shard (0/{len(self.fout)})", - total=None, - unit="byte", - unit_scale=True, - ) - bar = tqdm( - desc="Writing", total=total_bytes, unit="byte", unit_scale=True - ) + shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True) + bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)): if shard_bar is not None: @@ -515,9 +449,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: # relying on the fact that Python dicts preserve insertion order (since 3.7) for ti in tensors.values(): - assert ( - ti.tensor is not None - ) # can only iterate once over the tensors + assert ti.tensor is not None # can only iterate once over the tensors assert ti.tensor.nbytes == ti.nbytes ti.tensor.tofile(fout) if shard_bar is not None: @@ -529,9 +461,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: else: self.temp_file.seek(0) - shutil.copyfileobj( - self.temp_file, self.fout[0 if not self.small_first_shard else 1] - ) + shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1]) self.flush() self.temp_file.close() @@ -637,14 +567,10 @@ def add_base_model_version(self, source_id: int, 
version: str) -> None: self.add_string(Keys.General.BASE_MODEL_VERSION.format(id=source_id), version) def add_base_model_organization(self, source_id: int, organization: str) -> None: - self.add_string( - Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization - ) + self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization) def add_base_model_description(self, source_id: int, description: str) -> None: - self.add_string( - Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description - ) + self.add_string(Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description) def add_base_model_url(self, source_id: int, url: str) -> None: self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url) @@ -671,14 +597,10 @@ def add_dataset_version(self, source_id: int, version: str) -> None: self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version) def add_dataset_organization(self, source_id: int, organization: str) -> None: - self.add_string( - Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization - ) + self.add_string(Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization) def add_dataset_description(self, source_id: int, description: str) -> None: - self.add_string( - Keys.General.DATASET_DESCRIPTION.format(id=source_id), description - ) + self.add_string(Keys.General.DATASET_DESCRIPTION.format(id=source_id), description) def add_dataset_url(self, source_id: int, url: str) -> None: self.add_string(Keys.General.DATASET_URL.format(id=source_id), url) @@ -729,9 +651,7 @@ def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) def add_leading_dense_block_count(self, length: int) -> None: - self.add_uint32( - Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length - ) + self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length) def add_feed_forward_length(self, length: 
int | Sequence[int]) -> None: if isinstance(length, int): @@ -740,14 +660,10 @@ def add_feed_forward_length(self, length: int | Sequence[int]) -> None: self.add_array(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length) def add_expert_feed_forward_length(self, length: int) -> None: - self.add_uint32( - Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length - ) + self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length) def add_expert_shared_feed_forward_length(self, length: int) -> None: - self.add_uint32( - Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length - ) + self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length) def add_parallel_residual(self, use: bool) -> None: self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) @@ -773,6 +689,12 @@ def add_key_length(self, length: int) -> None: def add_value_length(self, length: int) -> None: self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) + def add_key_length_mla(self, length: int) -> None: + self.add_uint32(Keys.Attention.KEY_LENGTH_MLA.format(arch=self.arch), length) + + def add_value_length_mla(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length) + def add_max_alibi_bias(self, bias: float) -> None: self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) @@ -806,6 +728,9 @@ def add_expert_weights_norm(self, value: bool) -> None: def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None: self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) + def add_moe_every_n_layers(self, value: int) -> None: + self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value) + def add_swin_norm(self, value: bool) -> None: self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) @@ -830,6 +755,9 @@ def add_wkv_head_size(self, size: int) -> None: def 
add_token_shift_count(self, count: int) -> None: self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count) + def add_interleave_moe_layer_step(self, value: int) -> None: + self.add_uint32(Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value) + def add_layer_norm_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) @@ -851,6 +779,18 @@ def add_q_lora_rank(self, length: int) -> None: def add_kv_lora_rank(self, length: int) -> None: self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length) + def add_decay_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.DECAY_LORA_RANK.format(arch=self.arch), length) + + def add_iclr_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.ICLR_LORA_RANK.format(arch=self.arch), length) + + def add_value_residual_mix_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length) + + def add_gate_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length) + def add_relative_attn_buckets_count(self, value: int) -> None: self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value) @@ -911,14 +851,10 @@ def add_tokenizer_model(self, model: str) -> None: def add_tokenizer_pre(self, pre: str) -> None: self.add_string(Keys.Tokenizer.PRE, pre) - def add_token_list( - self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray] - ) -> None: + def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: self.add_array(Keys.Tokenizer.LIST, tokens) - def add_token_merges( - self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray] - ) -> None: + def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: self.add_array(Keys.Tokenizer.MERGES, merges) def add_token_types(self, 
types: Sequence[TokenType] | Sequence[int]) -> None: @@ -969,22 +905,18 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: template_names = set() for choice in value: - name = choice.get("name", "") - template = choice.get("template") + name = choice.get('name', '') + template = choice.get('template') # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it - name = "".join( - (c if c in ascii_letters + digits else "_" for c in name) - ) + name = ''.join((c if c in ascii_letters + digits else '_' for c in name)) if name and template is not None: - if name == "default": + if name == 'default': template_default = template else: template_names.add(name) - self.add_string( - Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template - ) + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template) if template_names: self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names)) @@ -1002,11 +934,58 @@ def add_eot_token_id(self, id: int) -> None: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + # for vision models + + def add_vision_projection_dim(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) + + def add_vision_has_vision_encoder(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value) + + def add_vision_patch_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PATCH_SIZE, value) + + def add_vision_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value) + + def add_vision_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value) + + def add_vision_block_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value) + + def add_vision_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value) + + 
def add_vision_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) + + def add_vision_attention_layernorm_eps(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + + def add_vision_image_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) + + def add_vision_image_mean(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_MEAN, values) + + def add_vision_image_std(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_STD, values) + + def add_vision_use_gelu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_GELU, value) + + def add_vision_use_silu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_SILU, value) + + def add_vision_projector_scale_factor(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: - pack_prefix = "" + pack_prefix = '' if not skip_pack_prefix: - pack_prefix = "<" if self.endianess == GGUFEndian.LITTLE else ">" - return struct.pack(f"{pack_prefix}{fmt}", value) + pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>' + return struct.pack(f'{pack_prefix}{fmt}', value) def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: kv_data = bytearray() @@ -1016,14 +995,13 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: pack_fmt = self._simple_value_packing.get(vtype) if pack_fmt is not None: - kv_data += self._pack( - pack_fmt, val, skip_pack_prefix=vtype == GGUFValueType.BOOL - ) + kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL) elif vtype == GGUFValueType.STRING: encoded_val = val.encode("utf-8") if isinstance(val, str) else val kv_data += self._pack("Q", len(encoded_val)) kv_data += encoded_val elif vtype == GGUFValueType.ARRAY: + if 
not isinstance(val, Sequence): raise ValueError("Invalid GGUF metadata array, expecting sequence") @@ -1035,9 +1013,7 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: else: ltype = GGUFValueType.get_type(val[0]) if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): - raise ValueError( - "All items in a GGUF array should be of the same type" - ) + raise ValueError("All items in a GGUF array should be of the same type") kv_data += self._pack("I", ltype) kv_data += self._pack("Q", len(val)) for item in val: diff --git a/lpm_kernel/L2/gguf-py/gguf/lazy.py b/lpm_kernel/L2/gguf-py/gguf/lazy.py index aec57ee0..f9bcadae 100644 --- a/lpm_kernel/L2/gguf-py/gguf/lazy.py +++ b/lpm_kernel/L2/gguf-py/gguf/lazy.py @@ -12,9 +12,8 @@ class LazyMeta(ABCMeta): - def __new__( - cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs - ): + + def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs): def __getattr__(self, name: str) -> Any: meta_attr = getattr(self._meta, name) if callable(meta_attr): @@ -42,7 +41,6 @@ def wrapped_special_op(self, *args, **kwargs): getattr(type(self)._tensor_type, op_name), meta_noop=meta_noop, )(self, *args, **kwargs) - return wrapped_special_op # special methods bypass __getattr__, so they need to be added manually @@ -50,48 +48,11 @@ def wrapped_special_op(self, *args, **kwargs): # NOTE: doing this from a metaclass is very convenient # TODO: make this even more comprehensive for binary_op in ( - "lt", - "le", - "eq", - "ne", - "ge", - "gt", - "not" "abs", - "add", - "and", - "floordiv", - "invert", - "lshift", - "mod", - "mul", - "matmul", - "neg", - "or", - "pos", - "pow", - "rshift", - "sub", - "truediv", - "xor", - "iadd", - "iand", - "ifloordiv", - "ilshift", - "imod", - "imul", - "ior", - "irshift", - "isub", - "ixor", - "radd", - "rand", - "rfloordiv", - "rmul", - "ror", - "rpow", - "rsub", - "rtruediv", - "rxor", + "lt", "le", "eq", "ne", "ge", 
"gt", "not" + "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul", + "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor", + "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor", + "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor", ): attr_name = f"__{binary_op}__" # the result of these operators usually has the same shape and dtype as the input, @@ -99,9 +60,7 @@ def wrapped_special_op(self, *args, **kwargs): namespace[attr_name] = mk_wrap(attr_name, meta_noop=True) for special_op in ( - "getitem", - "setitem", - "len", + "getitem", "setitem", "len", ): attr_name = f"__{special_op}__" namespace[attr_name] = mk_wrap(attr_name, meta_noop=False) @@ -118,15 +77,7 @@ class LazyBase(ABC, metaclass=LazyMeta): _kwargs: dict[str, Any] _func: Callable[[Any], Any] | None - def __init__( - self, - *, - meta: Any, - data: Any | None = None, - args: tuple = (), - kwargs: dict[str, Any] | None = None, - func: Callable[[Any], Any] | None = None, - ): + def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None): super().__init__() self._meta = meta self._data = data @@ -156,15 +107,7 @@ def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any: return o @classmethod - def _wrap_fn( - cls, - fn: Callable, - *, - use_self: LazyBase | None = None, - meta_noop: bool - | DTypeLike - | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False, - ) -> Callable[[Any], Any]: + def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]: def wrapped_fn(*args, **kwargs): if kwargs is None: kwargs = {} @@ -195,16 +138,23 @@ def wrapped_fn(*args, **kwargs): res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) if isinstance(res, cls._tensor_type): - 
return cls( - meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn - ) + return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn) + elif isinstance(res, tuple) and all(isinstance(t, cls._tensor_type) for t in res): + # share the evaluation between lazy tuple elements + shared_args: list = [args, None] + + def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase: + assert len(a) == 2 + if a[1] is None: + a[1] = fn(*a[0], **kw) + return a[1][i] + return tuple(cls(meta=cls.eager_to_meta(res[i]), args=(shared_args, i), kwargs=kwargs, func=eager_tuple_element) for i in range(len(res))) else: del res # not needed # non-tensor return likely relies on the contents of the args # (e.g. the result of torch.equal) eager_args = cls.to_eager(args) return fn(*eager_args, **kwargs) - return wrapped_fn @classmethod @@ -235,8 +185,7 @@ def eager_to_meta(cls, t: Any) -> Any: # must be overridden, meta tensor init is backend-specific @classmethod @abstractmethod - def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: - pass + def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass @classmethod def from_eager(cls, t: Any) -> Any: @@ -255,9 +204,7 @@ class LazyNumpyTensor(LazyBase): shape: tuple[int, ...] # Makes the type checker happy in quants.py @classmethod - def meta_with_dtype_and_shape( - cls, dtype: DTypeLike, shape: tuple[int, ...] - ) -> np.ndarray[Any, Any]: + def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]: # The initial idea was to use np.nan as the fill value, # but non-float types like np.int16 can't use that. # So zero it is. 
@@ -266,16 +213,8 @@ def meta_with_dtype_and_shape( def astype(self, dtype, *args, **kwargs): meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape) - full_args = ( - self, - dtype, - ) + args - return type(self)( - meta=meta, - args=full_args, - kwargs=kwargs, - func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)), - ) + full_args = (self, dtype,) + args + return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs))) def tofile(self, *args, **kwargs): eager = LazyNumpyTensor.to_eager(self) diff --git a/lpm_kernel/L2/gguf-py/gguf/metadata.py b/lpm_kernel/L2/gguf-py/gguf/metadata.py index 50270c25..e807f434 100644 --- a/lpm_kernel/L2/gguf-py/gguf/metadata.py +++ b/lpm_kernel/L2/gguf-py/gguf/metadata.py @@ -44,12 +44,7 @@ class Metadata: datasets: Optional[list[dict]] = None @staticmethod - def load( - metadata_override_path: Optional[Path] = None, - model_path: Optional[Path] = None, - model_name: Optional[str] = None, - total_params: int = 0, - ) -> Metadata: + def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata: # This grabs as many contextual authorship metadata as possible from the model repository # making any conversion as required to match the gguf kv store metadata format # as well as giving users the ability to override any authorship metadata that may be incorrect @@ -62,77 +57,45 @@ def load( # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter # heuristics - metadata = Metadata.apply_metadata_heuristic( - metadata, model_card, hf_params, model_path, total_params - ) + metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params) # Metadata Override File Provided # This is based on LLM_KV_NAMES mapping in llama.cpp metadata_override = 
Metadata.load_metadata_override(metadata_override_path) - metadata.name = metadata_override.get(Keys.General.NAME, metadata.name) - metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author) - metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version) - metadata.organization = metadata_override.get( - Keys.General.ORGANIZATION, metadata.organization - ) - - metadata.finetune = metadata_override.get( - Keys.General.FINETUNE, metadata.finetune - ) - metadata.basename = metadata_override.get( - Keys.General.BASENAME, metadata.basename - ) - - metadata.description = metadata_override.get( - Keys.General.DESCRIPTION, metadata.description - ) - metadata.quantized_by = metadata_override.get( - Keys.General.QUANTIZED_BY, metadata.quantized_by - ) - - metadata.size_label = metadata_override.get( - Keys.General.SIZE_LABEL, metadata.size_label - ) - metadata.license_name = metadata_override.get( - Keys.General.LICENSE_NAME, metadata.license_name - ) - metadata.license_link = metadata_override.get( - Keys.General.LICENSE_LINK, metadata.license_link - ) - - metadata.url = metadata_override.get(Keys.General.URL, metadata.url) - metadata.doi = metadata_override.get(Keys.General.DOI, metadata.doi) - metadata.uuid = metadata_override.get(Keys.General.UUID, metadata.uuid) - metadata.repo_url = metadata_override.get( - Keys.General.REPO_URL, metadata.repo_url - ) - - metadata.source_url = metadata_override.get( - Keys.General.SOURCE_URL, metadata.source_url - ) - metadata.source_doi = metadata_override.get( - Keys.General.SOURCE_DOI, metadata.source_doi - ) - metadata.source_uuid = metadata_override.get( - Keys.General.SOURCE_UUID, metadata.source_uuid - ) - metadata.source_repo_url = metadata_override.get( - Keys.General.SOURCE_REPO_URL, metadata.source_repo_url - ) + metadata.name = metadata_override.get(Keys.General.NAME, metadata.name) + metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author) + metadata.version = 
metadata_override.get(Keys.General.VERSION, metadata.version) + metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization) + + metadata.finetune = metadata_override.get(Keys.General.FINETUNE, metadata.finetune) + metadata.basename = metadata_override.get(Keys.General.BASENAME, metadata.basename) + + metadata.description = metadata_override.get(Keys.General.DESCRIPTION, metadata.description) + metadata.quantized_by = metadata_override.get(Keys.General.QUANTIZED_BY, metadata.quantized_by) + + metadata.size_label = metadata_override.get(Keys.General.SIZE_LABEL, metadata.size_label) + metadata.license_name = metadata_override.get(Keys.General.LICENSE_NAME, metadata.license_name) + metadata.license_link = metadata_override.get(Keys.General.LICENSE_LINK, metadata.license_link) + + metadata.url = metadata_override.get(Keys.General.URL, metadata.url) + metadata.doi = metadata_override.get(Keys.General.DOI, metadata.doi) + metadata.uuid = metadata_override.get(Keys.General.UUID, metadata.uuid) + metadata.repo_url = metadata_override.get(Keys.General.REPO_URL, metadata.repo_url) + + metadata.source_url = metadata_override.get(Keys.General.SOURCE_URL, metadata.source_url) + metadata.source_doi = metadata_override.get(Keys.General.SOURCE_DOI, metadata.source_doi) + metadata.source_uuid = metadata_override.get(Keys.General.SOURCE_UUID, metadata.source_uuid) + metadata.source_repo_url = metadata_override.get(Keys.General.SOURCE_REPO_URL, metadata.source_repo_url) # Base Models is received here as an array of models - metadata.base_models = metadata_override.get( - "general.base_models", metadata.base_models - ) + metadata.base_models = metadata_override.get("general.base_models", metadata.base_models) # Datasets is received here as an array of datasets - metadata.datasets = metadata_override.get("general.datasets", metadata.datasets) + metadata.datasets = metadata_override.get("general.datasets", metadata.datasets) - metadata.tags = 
metadata_override.get(Keys.General.TAGS, metadata.tags) - metadata.languages = metadata_override.get( - Keys.General.LANGUAGES, metadata.languages - ) + metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags) + metadata.languages = metadata_override.get(Keys.General.LANGUAGES, metadata.languages) # Direct Metadata Override (via direct cli argument) if model_name is not None: @@ -141,9 +104,7 @@ def load( return metadata @staticmethod - def load_metadata_override( - metadata_override_path: Optional[Path] = None, - ) -> dict[str, Any]: + def load_metadata_override(metadata_override_path: Optional[Path] = None) -> dict[str, Any]: if metadata_override_path is None or not metadata_override_path.is_file(): return {} @@ -160,21 +121,39 @@ def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]: if not model_card_path.is_file(): return {} - # The model card metadata is assumed to always be in YAML + # The model card metadata is assumed to always be in YAML (frontmatter) # ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473 + yaml_content: str = "" with open(model_card_path, "r", encoding="utf-8") as f: - if f.readline() == "---\n": - raw = f.read().partition("---\n")[0] - data = yaml.safe_load(raw) - if isinstance(data, dict): - return data + content = f.read() + lines = content.splitlines() + lines_yaml = [] + if len(lines) == 0: + # Empty file + return {} + if len(lines) > 0 and lines[0] != "---": + # No frontmatter + return {} + for line in lines[1:]: + if line == "---": + break # End of frontmatter else: - logger.error( - f"while reading YAML model card frontmatter, data is {type(data)} instead of dict" - ) - return {} + lines_yaml.append(line) + yaml_content = "\n".join(lines_yaml) + "\n" + + # Quick hack to fix the Norway problem + # https://hitchdev.com/strictyaml/why/implicit-typing-removed/ + yaml_content = yaml_content.replace("- no\n", "- 
\"no\"\n") + + if yaml_content: + data = yaml.safe_load(yaml_content) + if isinstance(data, dict): + return data else: + logger.error(f"while reading YAML model card frontmatter, data is {type(data)} instead of dict") return {} + else: + return {} @staticmethod def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]: @@ -192,19 +171,10 @@ def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]: @staticmethod def id_to_title(string): # Convert capitalization into title form unless acronym or version number - return " ".join( - [ - w.title() - if w.islower() and not re.match(r"^(v\d+(?:\.\d+)*|\d.*)$", w) - else w - for w in string.strip().replace("-", " ").split() - ] - ) + return ' '.join([w.title() if w.islower() and not re.match(r'^(v\d+(?:\.\d+)*|\d.*)$', w) else w for w in string.strip().replace('-', ' ').split()]) @staticmethod - def get_model_id_components( - model_id: Optional[str] = None, total_params: int = 0 - ) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]: + def get_model_id_components(model_id: Optional[str] = None, total_params: int = 0) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]: # Huggingface often store model id as '/' # so let's parse it and apply some heuristics if possible for model name components @@ -212,28 +182,24 @@ def get_model_id_components( # model ID missing return None, None, None, None, None, None - if " " in model_id: + if ' ' in model_id: # model ID is actually a normal human sentence # which means its most likely a normal model name only # not part of the hugging face naming standard, but whatever return model_id, None, None, None, None, None - if "/" in model_id: + if '/' in model_id: # model ID (huggingface style) - org_component, model_full_name_component = model_id.split("/", 1) + org_component, model_full_name_component = model_id.split('/', 1) else: # model ID but missing org components org_component, 
model_full_name_component = None, model_id # Check if we erroneously matched against './' or '../' etc... - if ( - org_component is not None - and len(org_component) > 0 - and org_component[0] == "." - ): + if org_component is not None and len(org_component) > 0 and org_component[0] == '.': org_component = None - name_parts: list[str] = model_full_name_component.split("-") + name_parts: list[str] = model_full_name_component.split('-') # Remove empty parts for i in reversed(range(len(name_parts))): @@ -247,18 +213,14 @@ def get_model_id_components( # Annotate the name for i, part in enumerate(name_parts): # Version - if re.fullmatch(r"(v|iter)?\d+([.]\d+)*", part, re.IGNORECASE): + if re.fullmatch(r'(v|iter)?\d+([.]\d+)*', part, re.IGNORECASE): name_types[i].add("version") # Quant type (should not be there for base models, but still annotated) - elif re.fullmatch(r"i?q\d(_\w)*|b?fp?(16|32)", part, re.IGNORECASE): + elif re.fullmatch(r'i?q\d(_\w)*|b?fp?(16|32)', part, re.IGNORECASE): name_types[i].add("type") name_parts[i] = part.upper() # Model size - elif i > 0 and re.fullmatch( - r"(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)", - part, - re.IGNORECASE, - ): + elif i > 0 and re.fullmatch(r'(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)', part, re.IGNORECASE): part = part.replace("_", ".") # Handle weird bloom-7b1 notation if part[-1].isdecimal(): @@ -269,19 +231,14 @@ def get_model_id_components( part = part[:-1] + part[-1].upper() if total_params != 0: try: - label_params = float(part[:-1]) * pow( - 1000, " KMBT".find(part[-1]) - ) + label_params = float(part[:-1]) * pow(1000, " KMBT".find(part[-1])) # Only use it as a size label if it's close or bigger than the model size # Note that LoRA adapters don't necessarily include all layers, # so this is why bigger label sizes are accepted. 
# Do not use the size label when it's smaller than 1/8 of the model size - if ( - total_params < 0 and label_params < abs(total_params) // 8 - ) or ( + if (total_params < 0 and label_params < abs(total_params) // 8) or ( # Check both directions when the current model isn't a LoRA adapter - total_params > 0 - and abs(label_params - total_params) > 7 * total_params // 8 + total_params > 0 and abs(label_params - total_params) > 7 * total_params // 8 ): # Likely a context length name_types[i].add("finetune") @@ -294,9 +251,7 @@ def get_model_id_components( name_types[i].add("size_label") name_parts[i] = part # Some easy to recognize finetune names - elif i > 0 and re.fullmatch( - r"chat|instruct|vision|lora", part, re.IGNORECASE - ): + elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE): if total_params < 0 and part.lower() == "lora": # ignore redundant "lora" in the finetune part when the output is a lora adapter name_types[i].add("type") @@ -305,12 +260,7 @@ def get_model_id_components( # Ignore word-based size labels when there is at least a number-based one present # TODO: should word-based size labels always be removed instead? 
- if any( - c.isdecimal() - for n, t in zip(name_parts, name_types) - if "size_label" in t - for c in n - ): + if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n): for n, t in zip(name_parts, name_types): if "size_label" in t: if all(c.isalpha() for c in n): @@ -334,55 +284,22 @@ def get_model_id_components( else: break - basename = ( - "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) - or None - ) + basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None # Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys) - size_label = ( - "-".join( - dict.fromkeys( - s for s, t in zip(name_parts, name_types) if "size_label" in t - ).keys() - ) - or None - ) - finetune = ( - "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) - or None - ) + size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None + finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None # TODO: should the basename version always be excluded? 
# NOTE: multiple finetune versions are joined together - version = ( - "-".join( - v - for v, t in zip(name_parts, name_types) - if "version" in t and "basename" not in t - ) - or None - ) + version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None if size_label is None and finetune is None and version is None: # Too ambiguous, output nothing basename = None - return ( - model_full_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) + return model_full_name_component, org_component, basename, finetune, version, size_label @staticmethod - def apply_metadata_heuristic( - metadata: Metadata, - model_card: Optional[dict] = None, - hf_params: Optional[dict] = None, - model_path: Optional[Path] = None, - total_params: int = 0, - ) -> Metadata: + def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = None, hf_params: Optional[dict] = None, model_path: Optional[Path] = None, total_params: int = 0) -> Metadata: # Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1 # Model Card Heuristics @@ -390,10 +307,7 @@ def apply_metadata_heuristic( if model_card is not None: def use_model_card_metadata(metadata_key: str, model_card_key: str): - if ( - model_card_key in model_card - and getattr(metadata, metadata_key, None) is None - ): + if model_card_key in model_card and getattr(metadata, metadata_key, None) is None: setattr(metadata, metadata_key, model_card.get(model_card_key)) def use_array_model_card_metadata(metadata_key: str, model_card_key: str): @@ -454,21 +368,12 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str): use_model_card_metadata("author", "model_creator") use_model_card_metadata("basename", "model_type") - if ( - "base_model" in model_card - or "base_models" in model_card - or "base_model_sources" in model_card - ): + if "base_model" in model_card or "base_models" in model_card or 
"base_model_sources" in model_card: # This represents the parent models that this is based on # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges) # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md metadata_base_models = [] - base_model_value = model_card.get( - "base_model", - model_card.get( - "base_models", model_card.get("base_model_sources", None) - ), - ) + base_model_value = model_card.get("base_model", model_card.get("base_models", model_card.get("base_model_sources", None))) if base_model_value is not None: if isinstance(base_model_value, str): @@ -483,94 +388,50 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str): # NOTE: model size of base model is assumed to be similar to the size of the current model base_model = {} if isinstance(model_id, str): - if ( - model_id.startswith("http://") - or model_id.startswith("https://") - or model_id.startswith("ssh://") - ): + if model_id.startswith("http://") or model_id.startswith("https://") or model_id.startswith("ssh://"): base_model["repo_url"] = model_id # Check if Hugging Face ID is present in URL if "huggingface.co" in model_id: - match = re.match( - r"https?://huggingface.co/([^/]+/[^/]+)$", model_id - ) + match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", model_id) if match: model_id_component = match.group(1) - ( - model_full_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components( - model_id_component, total_params - ) + model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id_component, total_params) # Populate model dictionary with extracted components if model_full_name_component is not None: - base_model["name"] = Metadata.id_to_title( - model_full_name_component - ) + base_model["name"] = Metadata.id_to_title(model_full_name_component) if org_component 
is not None: - base_model[ - "organization" - ] = Metadata.id_to_title(org_component) + base_model["organization"] = Metadata.id_to_title(org_component) if version is not None: base_model["version"] = version else: # Likely a Hugging Face ID - ( - model_full_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components(model_id, total_params) + model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params) # Populate model dictionary with extracted components if model_full_name_component is not None: - base_model["name"] = Metadata.id_to_title( - model_full_name_component - ) + base_model["name"] = Metadata.id_to_title(model_full_name_component) if org_component is not None: - base_model["organization"] = Metadata.id_to_title( - org_component - ) + base_model["organization"] = Metadata.id_to_title(org_component) if version is not None: base_model["version"] = version - if ( - org_component is not None - and model_full_name_component is not None - ): - base_model[ - "repo_url" - ] = f"https://huggingface.co/{org_component}/{model_full_name_component}" + if org_component is not None and model_full_name_component is not None: + base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}" elif isinstance(model_id, dict): base_model = model_id else: - logger.error( - f"base model entry '{str(model_id)}' not in a known format" - ) + logger.error(f"base model entry '{str(model_id)}' not in a known format") metadata.base_models.append(base_model) - if ( - "datasets" in model_card - or "dataset" in model_card - or "dataset_sources" in model_card - ): + if "datasets" in model_card or "dataset" in model_card or "dataset_sources" in model_card: # This represents the datasets that this was trained from metadata_datasets = [] - dataset_value = model_card.get( - "datasets", - model_card.get("dataset", 
model_card.get("dataset_sources", None)), - ) + dataset_value = model_card.get("datasets", model_card.get("dataset", model_card.get("dataset_sources", None))) if dataset_value is not None: if isinstance(dataset_value, str): @@ -590,74 +451,38 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str): # Check if Hugging Face ID is present in URL if "huggingface.co" in dataset_id: - match = re.match( - r"https?://huggingface.co/([^/]+/[^/]+)$", - dataset_id, - ) + match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", dataset_id) if match: dataset_id_component = match.group(1) - ( - dataset_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components( - dataset_id_component, total_params - ) + dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id_component, total_params) # Populate dataset dictionary with extracted components if dataset_name_component is not None: - dataset["name"] = Metadata.id_to_title( - dataset_name_component - ) + dataset["name"] = Metadata.id_to_title(dataset_name_component) if org_component is not None: - dataset["organization"] = Metadata.id_to_title( - org_component - ) + dataset["organization"] = Metadata.id_to_title(org_component) if version is not None: dataset["version"] = version else: # Likely a Hugging Face ID - ( - dataset_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components( - dataset_id, total_params - ) + dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id, total_params) # Populate dataset dictionary with extracted components if dataset_name_component is not None: - dataset["name"] = Metadata.id_to_title( - dataset_name_component - ) + dataset["name"] = Metadata.id_to_title(dataset_name_component) if org_component is not None: - 
dataset["organization"] = Metadata.id_to_title( - org_component - ) + dataset["organization"] = Metadata.id_to_title(org_component) if version is not None: dataset["version"] = version - if ( - org_component is not None - and dataset_name_component is not None - ): - dataset[ - "repo_url" - ] = f"https://huggingface.co/{org_component}/{dataset_name_component}" + if org_component is not None and dataset_name_component is not None: + dataset["repo_url"] = f"https://huggingface.co/{org_component}/{dataset_name_component}" elif isinstance(dataset_id, dict): dataset = dataset_id else: - logger.error( - f"dataset entry '{str(dataset_id)}' not in a known format" - ) + logger.error(f"dataset entry '{str(dataset_id)}' not in a known format") metadata.datasets.append(dataset) @@ -675,19 +500,13 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str): #################################### if hf_params is not None: + hf_name_or_path = hf_params.get("_name_or_path") - if hf_name_or_path is not None and hf_name_or_path.count("/") <= 1: + if hf_name_or_path is not None and hf_name_or_path.count('/') <= 1: # Use _name_or_path only if its actually a model name and not some computer path # e.g. 
'meta-llama/Llama-2-7b-hf' model_id = hf_name_or_path - ( - model_full_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components(model_id, total_params) + model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params) if metadata.name is None and model_full_name_component is not None: metadata.name = Metadata.id_to_title(model_full_name_component) if metadata.organization is None and org_component is not None: @@ -705,14 +524,7 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str): ############################################ if model_path is not None: model_id = model_path.name - ( - model_full_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components(model_id, total_params) + model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params) if metadata.name is None and model_full_name_component is not None: metadata.name = Metadata.id_to_title(model_full_name_component) if metadata.organization is None and org_component is not None: @@ -790,13 +602,9 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): if "version" in base_model_entry: gguf_writer.add_base_model_version(key, base_model_entry["version"]) if "organization" in base_model_entry: - gguf_writer.add_base_model_organization( - key, base_model_entry["organization"] - ) + gguf_writer.add_base_model_organization(key, base_model_entry["organization"]) if "description" in base_model_entry: - gguf_writer.add_base_model_description( - key, base_model_entry["description"] - ) + gguf_writer.add_base_model_description(key, base_model_entry["description"]) if "url" in base_model_entry: gguf_writer.add_base_model_url(key, base_model_entry["url"]) if "doi" in base_model_entry: @@ -804,9 +612,7 @@ def 
set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): if "uuid" in base_model_entry: gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"]) if "repo_url" in base_model_entry: - gguf_writer.add_base_model_repo_url( - key, base_model_entry["repo_url"] - ) + gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"]) if self.datasets is not None: gguf_writer.add_dataset_count(len(self.datasets)) @@ -818,13 +624,9 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): if "version" in dataset_entry: gguf_writer.add_dataset_version(key, dataset_entry["version"]) if "organization" in dataset_entry: - gguf_writer.add_dataset_organization( - key, dataset_entry["organization"] - ) + gguf_writer.add_dataset_organization(key, dataset_entry["organization"]) if "description" in dataset_entry: - gguf_writer.add_dataset_description( - key, dataset_entry["description"] - ) + gguf_writer.add_dataset_description(key, dataset_entry["description"]) if "url" in dataset_entry: gguf_writer.add_dataset_url(key, dataset_entry["url"]) if "doi" in dataset_entry: diff --git a/lpm_kernel/L2/gguf-py/gguf/quants.py b/lpm_kernel/L2/gguf-py/gguf/quants.py index d2aed1d6..3c8ba82e 100644 --- a/lpm_kernel/L2/gguf-py/gguf/quants.py +++ b/lpm_kernel/L2/gguf-py/gguf/quants.py @@ -11,35 +11,22 @@ import numpy as np -def quant_shape_to_byte_shape( - shape: Sequence[int], quant_type: GGMLQuantizationType -) -> tuple[int, ...]: +def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % block_size != 0: - raise ValueError( - f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})" - ) + raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})") return (*shape[:-1], shape[-1] // block_size * type_size) -def quant_shape_from_byte_shape( - shape: 
Sequence[int], quant_type: GGMLQuantizationType -) -> tuple[int, ...]: +def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % type_size != 0: - raise ValueError( - f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})" - ) + raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})") return (*shape[:-1], shape[-1] // type_size * block_size) # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time -def _apply_over_grouped_rows( - func: Callable[[np.ndarray], np.ndarray], - arr: np.ndarray, - otype: DTypeLike, - oshape: tuple[int, ...], -) -> np.ndarray: +def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: rows = arr.reshape((-1, arr.shape[-1])) osize = 1 for dim in oshape: @@ -47,11 +34,7 @@ def _apply_over_grouped_rows( out = np.empty(shape=osize, dtype=otype) # compute over groups of 16 rows (arbitrary, but seems good for performance) n_groups = (rows.shape[0] // 16) or 1 - np.concatenate( - [func(group).ravel() for group in np.array_split(rows, n_groups)], - axis=0, - out=out, - ) + np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out) return out.reshape(oshape) @@ -64,8 +47,7 @@ def np_roundf(n: np.ndarray) -> np.ndarray: return np.sign(n) * b -class QuantError(Exception): - ... +class QuantError(Exception): ... 
_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {} @@ -79,9 +61,7 @@ def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: elif (q := _type_traits.get(qtype)) is not None: return q.quantize(data) else: - raise NotImplementedError( - f"Quantization for {qtype.name} is not yet implemented" - ) + raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented") def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: @@ -92,9 +72,7 @@ def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: elif (q := _type_traits.get(qtype)) is not None: return q.dequantize(data) else: - raise NotImplementedError( - f"Dequantization for {qtype.name} is not yet implemented" - ) + raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented") class __Quant(ABC): @@ -114,10 +92,12 @@ def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: cls.qtype = qtype cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( - cls.__quantize_array, meta_noop=(np.uint8, cls.__shape_to_bytes) + cls.__quantize_array, + meta_noop=(np.uint8, cls.__shape_to_bytes) ) cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( - cls.__dequantize_array, meta_noop=(np.float32, cls.__shape_from_bytes) + cls.__dequantize_array, + meta_noop=(np.float32, cls.__shape_from_bytes) ) assert qtype not in _type_traits _type_traits[qtype] = cls @@ -134,14 +114,10 @@ def init_grid(cls): grid = np.frombuffer(cls.grid_hex, dtype=np.uint8) # decode hexadecimal chars from grid grid = grid.reshape((-1, 2)) - grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array( - [4, 0], dtype=np.uint8 - ).reshape((1, 2)) + grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array([4, 0], dtype=np.uint8).reshape((1, 2)) grid = grid[..., 0] | grid[..., 1] # unpack the grid values - grid = grid.reshape((-1, 1)) >> np.array( - [i for i in range(0, 8, 8 // 
elems_per_byte)], dtype=np.uint8 - ).reshape((1, elems_per_byte)) + grid = grid.reshape((-1, 1)) >> np.array([i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8).reshape((1, elems_per_byte)) grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1)) grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1)) grid = np.take_along_axis(grid_map, grid, axis=-1) @@ -189,22 +165,12 @@ def __shape_from_bytes(cls, shape: Sequence[int]): @classmethod def __quantize_array(cls, array: np.ndarray) -> np.ndarray: - return _apply_over_grouped_rows( - cls.quantize_rows, - arr=array, - otype=np.uint8, - oshape=cls.__shape_to_bytes(array.shape), - ) + return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape)) @classmethod def __dequantize_array(cls, array: np.ndarray) -> np.ndarray: cls.init_grid() - return _apply_over_grouped_rows( - cls.dequantize_rows, - arr=array, - otype=np.float32, - oshape=cls.__shape_from_bytes(array.shape), - ) + return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape)) @classmethod def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any: @@ -221,9 +187,7 @@ def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool: @classmethod def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: if not cls.can_quantize(tensor): - raise QuantError( - f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}" - ) + raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}") if isinstance(tensor, LazyNumpyTensor): return cls.__quantize_lazy(tensor) else: @@ -243,13 +207,9 @@ class BF16(__Quant, qtype=GGMLQuantizationType.BF16): def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: n = blocks.view(np.uint32) # force nan to quiet - n = np.where( - (n & 0x7FFFFFFF) > 0x7F800000, - (n & np.uint32(0xFFFF0000)) | np.uint32(64 << 16), - n, - ) + n = 
np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n) # round to nearest even - n = (np.uint64(n) + (0x7FFF + ((n >> 16) & 1))) >> 16 + n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16 return n.astype(np.uint16).view(np.uint8) @classmethod @@ -269,14 +229,7 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: with np.errstate(divide="ignore"): id = np.where(d == 0, 0, 1 / d) # FIXME: Q4_0's reference rounding is cursed and depends on FMA - qs = ( - np.trunc( - (np.float64(blocks) * np.float64(id)) + np.float64(8.5), - dtype=np.float32, - ) - .astype(np.uint8) - .clip(0, 15) - ) + qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15) qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) @@ -293,12 +246,10 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) - qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8) - return d * qs.astype(np.float32) + return (d * qs.astype(np.float32)) class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): @@ -312,11 +263,7 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = (max - min) / 15 with np.errstate(divide="ignore"): id = np.where(d == 0, 0, 1 / d) - qs = ( - np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32) - .astype(np.uint8) - .clip(0, 15) - ) + qs = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15) qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) @@ -336,9 +283,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> 
np.ndarray: d = d.view(np.float16).astype(np.float32) m = m.view(np.float16).astype(np.float32) - qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32) return (d * qs) + m @@ -356,21 +301,12 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: with np.errstate(divide="ignore"): id = np.where(d == 0, 0, 1 / d) # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = ( - np.trunc( - (np.float64(blocks) * np.float64(id)) + np.float64(16.5), - dtype=np.float32, - ) - .astype(np.uint8) - .clip(0, 31) - ) + q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31) qs = q.reshape((n_blocks, 2, cls.block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) - qh = np.packbits( - q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little" - ).reshape(n_blocks, 4) + qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4) d = d.astype(np.float16).view(np.uint8) @@ -386,18 +322,14 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) qh = qh.view(np.uint32) - qh = qh.reshape((n_blocks, 1)) >> np.array( - [i for i in range(32)], dtype=np.uint32 - ).reshape((1, 32)) - ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) qh = (qh & np.uint32(0x01)).astype(np.uint8) ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) qs = (ql 
| (qh << np.uint8(4))).astype(np.int8) - np.int8(16) - return d * qs.astype(np.float32) + return (d * qs.astype(np.float32)) class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): @@ -411,18 +343,12 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = (max - min) / 31 with np.errstate(divide="ignore"): id = np.where(d == 0, 0, 1 / d) - q = ( - np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32) - .astype(np.uint8) - .clip(0, 31) - ) + q = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 31) qs = q.reshape((n_blocks, 2, cls.block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) - qh = np.packbits( - q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little" - ).reshape(n_blocks, 4) + qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4) d = d.astype(np.float16).view(np.uint8) m = min.astype(np.float16).view(np.uint8) @@ -441,12 +367,8 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: m = m.view(np.float16).astype(np.float32) qh = qh.view(np.uint32) - qh = qh.reshape((n_blocks, 1)) >> np.array( - [i for i in range(32)], dtype=np.uint32 - ).reshape((1, 32)) - ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) qh = (qh & np.uint32(0x01)).astype(np.uint8) ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) @@ -459,6 +381,7 @@ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): @classmethod # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + d = abs(blocks).max(axis=1, keepdims=True) / 127 with 
np.errstate(divide="ignore"): id = np.where(d == 0, 0, 1 / d) @@ -477,7 +400,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) x = x.view(np.int8).astype(np.float32) - return x * d + return (x * d) class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K): @@ -494,9 +417,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: # (n_blocks, 16, 1) dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) - ml = (dmin * (scales >> 4).astype(np.float32)).reshape( - (n_blocks, QK_K // 16, 1) - ) + ml = (dmin * (scales >> 4).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) @@ -534,33 +455,21 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: # 10: OOKKGGCC # 11: PPLLHHDD lscales, hscales = np.hsplit(scales, [8]) - lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 2, 1)) + lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1)) lscales = lscales.reshape((n_blocks, 16)) - hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array( - [0, 2, 4, 6], dtype=np.uint8 - ).reshape((1, 4, 1)) + hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 4, 1)) hscales = hscales.reshape((n_blocks, 16)) - scales = (lscales & np.uint8(0x0F)) | ( - (hscales & np.uint8(0x03)) << np.uint8(4) - ) + scales = (lscales & np.uint8(0x0F)) | ((hscales & np.uint8(0x03)) << np.uint8(4)) scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32) dl = (d * scales).reshape((n_blocks, 16, 1)) - ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( - [0, 2, 4, 6], dtype=np.uint8 - ).reshape((1, 1, 4, 1)) - qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ).reshape((1, 1, 8, 1)) + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], 
dtype=np.uint8).reshape((1, 1, 4, 1)) + qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1)) ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3) - qh = qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1) + qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1)) qh = qh ^ np.uint8(1) # strangely, the offset is zero when the bitmask is 1 - q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype( - np.float32 - ) + q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype(np.float32) return (dl * q).reshape((n_blocks, QK_K)) @@ -609,9 +518,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) - qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32) return (d * qs - dm).reshape((n_blocks, QK_K)) @@ -635,12 +542,8 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) - ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) - qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ).reshape((1, 1, 8, 1)) + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1)) ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32)) q = (ql | (qh << np.uint8(4))).astype(np.float32) @@ -661,13 +564,9 @@ def 
dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) - ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) - qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array( - [0, 2, 4, 6], dtype=np.uint8 - ).reshape((1, 1, 4, 1)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32)) q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32) q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32) @@ -686,22 +585,12 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: qs = np_roundf(blocks * id) qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) - qs0, qs1, qh = ( - qs[..., : (32 * 5)], - qs[..., (32 * 5) : (48 * 5)], - qs[..., (48 * 5) :], - ) - qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array( - [81, 27, 9, 3, 1], dtype=np.uint8 - ).reshape((1, 1, 5, 1)) + qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):] + qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1)) qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1)) - qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array( - [81, 27, 9, 3, 1], dtype=np.uint8 - ).reshape((1, 1, 5, 1)) + qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1)) qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1)) - qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array( - [81, 27, 9, 3], dtype=np.uint8 - ).reshape((1, 1, 4, 1)) + qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1)) qh = np.sum(qh, axis=-2).reshape((n_blocks, -1)) qs = 
np.concatenate([qs0, qs1, qh], axis=-1) qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243 @@ -721,22 +610,16 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) qs0, qs1 = qs[..., :32], qs[..., 32:] - qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array( - [1, 3, 9, 27, 81], dtype=np.uint8 - ).reshape((1, 1, 5, 1)) + qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1)) qs0 = qs0.reshape((n_blocks, -1)) - qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array( - [1, 3, 9, 27, 81], dtype=np.uint8 - ).reshape((1, 1, 5, 1)) + qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1)) qs1 = qs1.reshape((n_blocks, -1)) - qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array( - [1, 3, 9, 27], dtype=np.uint8 - ).reshape((1, 1, 4, 1)) + qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1)) qh = qh.reshape((n_blocks, -1)) qs = np.concatenate([qs0, qs1, qh], axis=-1) qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1) - return d * qs.astype(np.float32) + return (d * qs.astype(np.float32)) class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0): @@ -750,9 +633,7 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: qs = np_roundf(blocks * id) qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) - qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array( - [0, 2, 4, 6], dtype=np.uint8 - ).reshape((1, 1, 4, 1)) + qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :] qs = qs.reshape((n_blocks, -1)) @@ -768,12 +649,10 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) - qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( - [0, 2, 4, 6], dtype=np.uint8 - ).reshape((1, 1, 4, 1)) + qs = 
qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1) - return d * qs.astype(np.float32) + return (d * qs.astype(np.float32)) class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): @@ -791,7 +670,7 @@ class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): # iq2xxs_grid, but with each byte of the original packed in 2 bits, # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. grid_shape = (256, 8) - grid_map = (0x08, 0x19, 0x2B) + grid_map = (0x08, 0x19, 0x2b) grid_hex = ( b"00000200050008000a00110014002000220028002a0041004400500058006100" b"6400800082008a00a20001010401100115014001840198010002020222028202" @@ -821,33 +700,21 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: qs = qs.view(np.uint32).reshape(n_blocks, -1, 2) - db = ( - d - * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) - * np.float32(0.25) - ) + db = d * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) * np.float32(0.25) db = db.reshape((n_blocks, -1, 1, 1)) # get the sign indices and unpack the bits - signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array( - [0, 7, 14, 21], dtype=np.uint32 - ).reshape((1, 1, 4)) + signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4)) ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) signs = np.take_along_axis(ksigns, signs, axis=-1) - signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ).reshape((1, 1, 1, 8)) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8)) signs = signs & np.uint8(0x01) signs = np.where(signs == 0, np.float32(1), np.float32(-1)) signs = signs.reshape((n_blocks, -1, 4, 8)) assert cls.grid is not None - grid = np.take_along_axis( - 
cls.grid, - qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), - axis=-2, - ) + grid = np.take_along_axis(cls.grid, qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), axis=-2) grid = grid.reshape((n_blocks, -1, 4, 8)) return (db * grid * signs).reshape((n_blocks, -1)) @@ -857,7 +724,7 @@ class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS): # iq2xs_grid, but with each byte of the original packed in 2 bits, # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. grid_shape = (512, 8) - grid_map = (0x08, 0x19, 0x2B) + grid_map = (0x08, 0x19, 0x2b) grid_hex = ( b"00000200050008000a0011001400160019002000220025002800410044004600" b"49005000520055005800610064008000820085008800910094009900a0000101" @@ -903,9 +770,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) qs = qs.view(np.uint16) - scales = scales.reshape((n_blocks, -1, 1)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2)) + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) scales = (scales & 0x0F).reshape((n_blocks, -1)) db = d * (np.float32(0.5) + scales) * np.float32(0.25) db = db.reshape((n_blocks, -1, 1, 1)) @@ -913,17 +778,13 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: # get the sign indices and unpack the bits signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128) signs = np.take_along_axis(signs, (qs >> 9).reshape((n_blocks, -1, 1)), axis=-1) - signs = signs.reshape((n_blocks, -1, 1)) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ).reshape((1, 1, 8)) + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) signs = signs & np.uint8(0x01) signs = np.where(signs == 0, np.float32(1), np.float32(-1)) signs = signs.reshape((n_blocks, -1, 2, 8)) assert cls.grid is not None - grid = np.take_along_axis( - cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2 
- ) + grid = np.take_along_axis(cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2) grid = grid.reshape((n_blocks, -1, 2, 8)) return (db * grid * signs).reshape((n_blocks, -1)) @@ -933,7 +794,7 @@ class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S): # iq2s_grid, but with each byte of the original packed in 2 bits, # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. grid_shape = (1024, 8) - grid_map = (0x08, 0x19, 0x2B) + grid_map = (0x08, 0x19, 0x2b) grid_hex = ( b"00000200050008000a0011001400160019002000220025002800410044004600" b"490050005200550058006100640066006900800082008500880091009400a000" @@ -1012,27 +873,19 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) - scales = scales.reshape((n_blocks, -1, 1)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2)) + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) scales = (scales & 0x0F).reshape((n_blocks, -1)) db = d * (np.float32(0.5) + scales) * np.float32(0.25) db = db.reshape((n_blocks, -1, 1, 1)) # unpack the sign bits - signs = signs.reshape((n_blocks, -1, 1)) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ).reshape((1, 1, 8)) + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) signs = signs & np.uint8(0x01) signs = np.where(signs == 0, np.float32(1), np.float32(-1)) signs = signs.reshape((n_blocks, -1, 2, 8)) - qh = qh.reshape((n_blocks, -1, 1)) >> np.array( - [0, 2, 4, 6], dtype=np.uint8 - ).reshape((1, 1, 4)) - qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape( - (n_blocks, -1) - ) + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape((n_blocks, -1)) assert cls.grid is not None grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) @@ -1043,7 
+896,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS): grid_shape = (256, 4) - grid_map = (0x04, 0x0C, 0x14, 0x1C, 0x24, 0x2C, 0x34, 0x3E) + grid_map = (0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3e) grid_hex = ( b"0000020004001100130017002000220031004200730075000101030110011201" b"2101250130013201410154017001000202020402110220022202310233023702" @@ -1077,15 +930,11 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: db = db.reshape((n_blocks, -1, 1, 1)) # get the sign indices and unpack the bits - signs = scales.reshape((n_blocks, -1, 1)) >> np.array( - [0, 7, 14, 21], dtype=np.uint32 - ).reshape((1, 1, 4)) + signs = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4)) ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) signs = np.take_along_axis(ksigns, signs, axis=-1) - signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ).reshape((1, 1, 1, 8)) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8)) signs = signs & np.uint8(0x01) signs = np.where(signs == 0, np.float32(1), np.float32(-1)) signs = signs.reshape((n_blocks, -1, 4, 8)) @@ -1099,7 +948,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S): grid_shape = (512, 4) - grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F) + grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f) grid_hex = ( b"0000010002000500070010001100120014001600200021002500330040004200" b"4500470051005300600062007100740077000001010102010401100111011501" @@ -1146,24 +995,18 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) - scales = scales.reshape((n_blocks, -1, 1)) >> 
np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2)) + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) scales = (scales & 0x0F).reshape((n_blocks, -1)) db = d * (1 + 2 * scales) db = db.reshape((n_blocks, -1, 1, 1)) # unpack the sign bits - signs = signs.reshape((n_blocks, -1, 1)) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ).reshape((1, 1, 8)) + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) signs = signs & np.uint8(0x01) signs = np.where(signs == 0, np.float32(1), np.float32(-1)) signs = signs.reshape((n_blocks, -1, 4, 8)) - qh = qh.reshape((n_blocks, -1, 1)) >> np.array( - [i for i in range(8)], dtype=np.uint8 - ) + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8) qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1)) qs = qs.astype(np.uint16) | (qh << 8) @@ -1327,9 +1170,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta) delta = delta.reshape((n_blocks, -1, 1, 1)) - qh = qh.reshape((n_blocks, -1, 1)) >> np.array( - [0, 3, 6, 9], dtype=np.uint16 - ).reshape((1, 1, 4)) + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4)) qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1)) assert cls.grid is not None @@ -1356,25 +1197,17 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: # The f16 scale is packed across multiple bytes scales = scales.view(np.uint16) - d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array( - [12, 8, 4, 0], dtype=np.uint16 - ).reshape((1, 4)) + d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array([12, 8, 4, 0], dtype=np.uint16).reshape((1, 4)) d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3] d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1)) - scales = scales.reshape(n_blocks, 
-1, 1) >> np.array( - [0, 3, 6, 9], dtype=np.uint16 - ).reshape((1, 1, 4)) + scales = scales.reshape(n_blocks, -1, 1) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4)) scales = (scales & 0x07).reshape((n_blocks, -1)) dl = d * (2 * scales + 1) dl = dl.reshape((n_blocks, -1, 2, 1, 1)) - qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape( - (1, 1, 2) - ) - qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape( - (n_blocks, -1) - ) + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape((n_blocks, -1)) delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta) delta = delta.reshape((n_blocks, -1, 2, 2, 1)) @@ -1397,20 +1230,14 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) - qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1)) kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16) - qs = ( - np.take_along_axis(kvalues, qs, axis=-1) - .astype(np.float32) - .reshape((n_blocks, -1)) - ) + qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1)) - return d * qs + return (d * qs) class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS): @@ -1425,28 +1252,18 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: d = d.view(np.float16).astype(np.float32) scales_h = scales_h.view(np.uint16) - scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2)) - scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array( - [2 * i for i in range(QK_K // 32)], dtype=np.uint16 - ).reshape((1, -1, 1)) + scales_l = 
scales_l.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array([2 * i for i in range(QK_K // 32)], dtype=np.uint16).reshape((1, -1, 1)) scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F) scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03) scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32) dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1)) - qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array( - [0, 4], dtype=np.uint8 - ).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F) kvalues = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1)) - qs = ( - np.take_along_axis(kvalues, qs, axis=-1) - .astype(np.float32) - .reshape((n_blocks, -1, 32)) - ) + qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1, 32)) return (dl * qs).reshape((n_blocks, -1)) diff --git a/lpm_kernel/L2/gguf-py/gguf/scripts/__init__.py b/lpm_kernel/L2/gguf-py/gguf/scripts/__init__.py index e77f2e9c..72cc73e7 100644 --- a/lpm_kernel/L2/gguf-py/gguf/scripts/__init__.py +++ b/lpm_kernel/L2/gguf-py/gguf/scripts/__init__.py @@ -4,3 +4,4 @@ from .gguf_dump import main as gguf_dump_entrypoint from .gguf_set_metadata import main as gguf_set_metadata_entrypoint from .gguf_new_metadata import main as gguf_new_metadata_entrypoint +from .gguf_editor_gui import main as gguf_editor_gui_entrypoint diff --git a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_convert_endian.py b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_convert_endian.py index aeebfc8b..0e0febaa 100755 --- a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_convert_endian.py +++ b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_convert_endian.py @@ -11,10 +11,7 @@ import numpy as np # Necessary to load the local gguf package -if ( - "NO_LOCAL_GGUF" not 
in os.environ - and (Path(__file__).parent.parent.parent.parent / "gguf-py").exists() -): +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent.parent)) import gguf @@ -23,24 +20,15 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # Host is little endian - host_endian = "little" - swapped_endian = "big" + file_endian = reader.endianess.name + if reader.byte_order == 'S': + host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE' else: - # Sorry PDP or other weird systems that don't use BE or LE. - host_endian = "big" - swapped_endian = "little" - if reader.byte_order == "S": - file_endian = swapped_endian - else: - file_endian = host_endian - order = host_endian if args.order == "native" else args.order - logger.info( - f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian" - ) + host_endian = file_endian + order = host_endian if args.order == "native" else args.order.upper() + logger.info(f"* Host is {host_endian} endian, GGUF file seems to be {file_endian} endian") if file_endian == order: - logger.info(f"* File is already {order.upper()} endian. Nothing to do.") + logger.info(f"* File is already {order} endian. 
Nothing to do.") sys.exit(0) logger.info("* Checking tensors for conversion compatibility") for tensor in reader.tensors: @@ -48,43 +36,31 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.Q8_0, + gguf.GGMLQuantizationType.Q4_K, + gguf.GGMLQuantizationType.Q6_K, ): - raise ValueError( - f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}" - ) - logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") + raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") + logger.info(f"* Preparing to convert from {file_endian} to {order}") if args.dry_run: return logger.warning("*** Warning *** Warning *** Warning **") - logger.warning( - "* This conversion process may damage the file. Ensure you have a backup." - ) + logger.warning("* This conversion process may damage the file. Ensure you have a backup.") if order != host_endian: - logger.warning( - "* Requested endian differs from host, you will not be able to load the model on this machine." - ) - logger.warning( - "* The file will be modified immediately, so if conversion fails or is interrupted" - ) - logger.warning( - "* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:" - ) + logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.") + logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted") + logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") response = input("YES, I am sure> ") if response != "YES": logger.warning("You didn't enter YES. 
Okay then, see ya!") sys.exit(0) logger.info(f"* Converting fields ({len(reader.fields)})") for idx, field in enumerate(reader.fields.values()): - logger.info( - f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}" - ) + logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") for part in field.parts: part.byteswap(inplace=True) logger.info(f"* Converting tensors ({len(reader.tensors)})") - for idx, tensor in enumerate( - pbar := tqdm(reader.tensors, desc="Converting tensor") - ): + for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")): log_message = ( f"Converting tensor {repr(tensor.name)}, " f"type={tensor.tensor_type.name}, " @@ -101,25 +77,72 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None # Specific handling of block_q8_0 is required. # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. - block_size = 34 # 34 bytes = + 32 * + block_size = 34 # 34 bytes = + 32 * n_blocks = len(tensor.data) // block_size - for block_num in ( - inner_pbar := tqdm( - range(n_blocks), desc="Byte-swapping Blocks", leave=False - ) - ): + for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): block_offs = block_num * block_size # Byte-Swap f16 sized delta field - delta = tensor.data[block_offs : block_offs + 2].view(dtype=np.uint16) + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) delta.byteswap(inplace=True) # Byte-Swap Q8 weights if block_num % 100000 == 0: - inner_pbar.set_description( - f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]" - ) + inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") + + elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K: + # Handle Q4_K tensor blocks (block_q4_k) + # Specific handling of block_q4_k is required. 
+ # Each block_q4_k consists of 2 f16 values followed by 140 int8 values. + + # first flatten structure + newshape = 1 + for i in tensor.data.shape: + newshape *= i + + tensor.data.resize(newshape) + + block_size = 144 + n_blocks = len(tensor.data) // block_size + for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): + block_offs = block_num * block_size + + # Byte-Swap f16 sized fields + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + # Byte-Swap + if block_num % 100000 == 0: + inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") + + elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K: + # Handle Q6_K tensor blocks (block_q6_k) + # Specific handling of block_q6_k is required. + # Each block_q6_k consists of 208 int8 values followed by 1 f16 value. 
+ + # first flatten structure + newshape = 1 + for i in tensor.data.shape: + newshape *= i + + tensor.data.resize(newshape) + + block_size = 210 + n_blocks = len(tensor.data) // block_size + for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): + block_offs = block_num * block_size + + # Byte-Swap f16 sized field + delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + # Byte-Swap + if block_num % 100000 == 0: + inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") else: # Handle other tensor types @@ -133,31 +156,25 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None def main() -> None: parser = argparse.ArgumentParser(description="Convert GGUF file byte order") parser.add_argument( - "model", - type=str, + "model", type=str, help="GGUF format model filename", ) parser.add_argument( - "order", - type=str, - choices=["big", "little", "native"], + "order", type=str, choices=['big', 'little', 'native'], help="Requested byte order", ) parser.add_argument( - "--dry-run", - action="store_true", + "--dry-run", action="store_true", help="Don't actually change anything", ) - parser.add_argument( - "--verbose", action="store_true", help="increase output verbosity" - ) + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - logger.info(f"* Loading: {args.model}") - reader = gguf.GGUFReader(args.model, "r" if args.dry_run else "r+") + logger.info(f'* Loading: {args.model}') + reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+') convert_byteorder(reader, args) diff --git a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_dump.py b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_dump.py index ca0fb9c5..e282892d 100755 --- 
a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_dump.py +++ b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_dump.py @@ -9,13 +9,8 @@ from pathlib import Path from typing import Any -import numpy as np - # Necessary to load the local gguf package -if ( - "NO_LOCAL_GGUF" not in os.environ - and (Path(__file__).parent.parent.parent.parent / "gguf-py").exists() -): +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from gguf import GGUFReader, GGUFValueType, ReaderTensor # noqa: E402 @@ -24,11 +19,11 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: - host_endian = "LITTLE" if np.uint32(1) == np.uint32(1).newbyteorder("<") else "BIG" - if reader.byte_order == "S": - file_endian = "BIG" if host_endian == "LITTLE" else "LITTLE" + file_endian = reader.endianess.name + if reader.byte_order == 'S': + host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE' else: - file_endian = host_endian + host_endian = file_endian return (host_endian, file_endian) @@ -36,47 +31,43 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: # please see the comments in the modify_gguf.py example. def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: host_endian, file_endian = get_file_host_endian(reader) - print( - f"* File is {file_endian} endian, script is running on a {host_endian} endian host." 
- ) # noqa: NP100 - print(f"* Dumping {len(reader.fields)} key/value pair(s)") # noqa: NP100 + print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') # noqa: NP100 + print(f'* Dumping {len(reader.fields)} key/value pair(s)') # noqa: NP100 for n, field in enumerate(reader.fields.values(), 1): if not field.types: - pretty_type = "N/A" + pretty_type = 'N/A' elif field.types[0] == GGUFValueType.ARRAY: nest_count = len(field.types) - 1 - pretty_type = ( - "[" * nest_count + str(field.types[-1].name) + "]" * nest_count - ) + pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count else: pretty_type = str(field.types[-1].name) - log_message = f" {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}" - if len(field.types) == 1: + log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}' + if field.types: curr_type = field.types[0] if curr_type == GGUFValueType.STRING: - log_message += " = {0}".format( - repr(str(bytes(field.parts[-1]), encoding="utf-8")[:60]) - ) - elif field.types[0] in reader.gguf_scalar_to_np: - log_message += " = {0}".format(field.parts[-1][0]) + content = field.contents() + if len(content) > 60: + content = content[:57] + '...' 
+ log_message += ' = {0}'.format(repr(content)) + elif curr_type in reader.gguf_scalar_to_np: + log_message += ' = {0}'.format(field.contents()) + else: + content = repr(field.contents(slice(6))) + if len(field.data) > 6: + content = content[:-1] + ', ...]' + log_message += ' = {0}'.format(content) print(log_message) # noqa: NP100 if args.no_tensors: return - print(f"* Dumping {len(reader.tensors)} tensor(s)") # noqa: NP100 + print(f'* Dumping {len(reader.tensors)} tensor(s)') # noqa: NP100 for n, tensor in enumerate(reader.tensors, 1): - prettydims = ", ".join( - "{0:5}".format(d) - for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)) - ) - print( - f" {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}" - ) # noqa: NP100 + prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape))) + print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') # noqa: NP100 def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: import json - host_endian, file_endian = get_file_host_endian(reader) metadata: dict[str, Any] = {} tensors: dict[str, Any] = {} @@ -89,7 +80,7 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: for idx, field in enumerate(reader.fields.values()): curr: dict[str, Any] = { "index": idx, - "type": field.types[0].name if field.types else "UNKNOWN", + "type": field.types[0].name if field.types else 'UNKNOWN', "offset": field.offset, } metadata[field.name] = curr @@ -97,19 +88,9 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: curr["array_types"] = [t.name for t in field.types][1:] if not args.json_array: continue - itype = field.types[-1] - if itype == GGUFValueType.STRING: - curr["value"] = [ - str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data - ] - else: - curr["value"] = [ - pv for idx in field.data for pv in 
field.parts[idx].tolist() - ] - elif field.types[0] == GGUFValueType.STRING: - curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8") + curr["value"] = field.contents() else: - curr["value"] = field.parts[-1].tolist()[0] + curr["value"] = field.contents() if not args.no_tensors: for idx, tensor in enumerate(reader.tensors): tensors[tensor.name] = { @@ -121,108 +102,72 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: json.dump(result, sys.stdout) -def markdown_table_with_alignment_support( - header_map: list[dict[str, str]], data: list[dict[str, Any]] -): +def markdown_table_with_alignment_support(header_map: list[dict[str, str]], data: list[dict[str, Any]]): # JSON to Markdown table formatting: https://stackoverflow.com/a/72983854/2850957 # Alignment Utility Function def strAlign(padding: int, alignMode: str | None, strVal: str): - if alignMode == "center": + if alignMode == 'center': return strVal.center(padding) - elif alignMode == "right": - return strVal.rjust(padding - 1) + " " - elif alignMode == "left": - return " " + strVal.ljust(padding - 1) - else: # default left - return " " + strVal.ljust(padding - 1) + elif alignMode == 'right': + return strVal.rjust(padding - 1) + ' ' + elif alignMode == 'left': + return ' ' + strVal.ljust(padding - 1) + else: # default left + return ' ' + strVal.ljust(padding - 1) def dashAlign(padding: int, alignMode: str | None): - if alignMode == "center": - return ":" + "-" * (padding - 2) + ":" - elif alignMode == "right": - return "-" * (padding - 1) + ":" - elif alignMode == "left": - return ":" + "-" * (padding - 1) - else: # default left - return "-" * (padding) + if alignMode == 'center': + return ':' + '-' * (padding - 2) + ':' + elif alignMode == 'right': + return '-' * (padding - 1) + ':' + elif alignMode == 'left': + return ':' + '-' * (padding - 1) + else: # default left + return '-' * (padding) # Calculate Padding For Each Column Based On Header and Data Length rowsPadding = {} 
for index, columnEntry in enumerate(header_map): - padCount = ( - max( - [ - len(str(v)) - for d in data - for k, v in d.items() - if k == columnEntry["key_name"] - ], - default=0, - ) - + 2 - ) - headerPadCount = len(columnEntry["header_name"]) + 2 + padCount = max([len(str(v)) for d in data for k, v in d.items() if k == columnEntry['key_name']], default=0) + 2 + headerPadCount = len(columnEntry['header_name']) + 2 rowsPadding[index] = headerPadCount if padCount <= headerPadCount else padCount # Render Markdown Header rows = [] - rows.append( - "|".join( - strAlign( - rowsPadding[index], - columnEntry.get("align"), - str(columnEntry["header_name"]), - ) - for index, columnEntry in enumerate(header_map) - ) - ) - rows.append( - "|".join( - dashAlign(rowsPadding[index], columnEntry.get("align")) - for index, columnEntry in enumerate(header_map) - ) - ) + rows.append('|'.join(strAlign(rowsPadding[index], columnEntry.get('align'), str(columnEntry['header_name'])) for index, columnEntry in enumerate(header_map))) + rows.append('|'.join(dashAlign(rowsPadding[index], columnEntry.get('align')) for index, columnEntry in enumerate(header_map))) # Render Tabular Data for item in data: - rows.append( - "|".join( - strAlign( - rowsPadding[index], - columnEntry.get("align"), - str(item[columnEntry["key_name"]]), - ) - for index, columnEntry in enumerate(header_map) - ) - ) + rows.append('|'.join(strAlign(rowsPadding[index], columnEntry.get('align'), str(item[columnEntry['key_name']])) for index, columnEntry in enumerate(header_map))) # Convert Tabular String Rows Into String tableString = "" for row in rows: - tableString += f"|{row}|\n" + tableString += f'|{row}|\n' return tableString def element_count_rounded_notation(count: int) -> str: - if count > 1e15: + if count > 1e15 : # Quadrillion scaled_amount = count * 1e-15 scale_suffix = "Q" - elif count > 1e12: + elif count > 1e12 : # Trillions scaled_amount = count * 1e-12 scale_suffix = "T" - elif count > 1e9: + elif count > 
1e9 : # Billions scaled_amount = count * 1e-9 scale_suffix = "B" - elif count > 1e6: + elif count > 1e6 : # Millions scaled_amount = count * 1e-6 scale_suffix = "M" - elif count > 1e3: + elif count > 1e3 : # Thousands scaled_amount = count * 1e-3 scale_suffix = "K" @@ -236,37 +181,37 @@ def element_count_rounded_notation(count: int) -> str: def translate_tensor_name(name): words = name.split(".") - # Source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#standardized-tensor-names + # Source: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#standardized-tensor-names abbreviation_dictionary = { - "token_embd": "Token embedding", - "pos_embd": "Position embedding", - "output_norm": "Output normalization", - "output": "Output", - "attn_norm": "Attention normalization", - "attn_norm_2": "Attention normalization", - "attn_qkv": "Attention query-key-value", - "attn_q": "Attention query", - "attn_k": "Attention key", - "attn_v": "Attention value", - "attn_output": "Attention output", - "ffn_norm": "Feed-forward network normalization", - "ffn_up": 'Feed-forward network "up"', - "ffn_gate": 'Feed-forward network "gate"', - "ffn_down": 'Feed-forward network "down"', - "ffn_gate_inp": "Expert-routing layer for the Feed-forward network in Mixture of Expert models", - "ffn_gate_exp": 'Feed-forward network "gate" layer per expert in Mixture of Expert models', - "ffn_down_exp": 'Feed-forward network "down" layer per expert in Mixture of Expert models', - "ffn_up_exp": 'Feed-forward network "up" layer per expert in Mixture of Expert models', - "ssm_in": "State space model input projections", - "ssm_conv1d": "State space model rolling/shift", - "ssm_x": "State space model selective parametrization", - "ssm_a": "State space model state compression", - "ssm_d": "State space model skip connection", - "ssm_dt": "State space model time step", - "ssm_out": "State space model output projection", - "blk": "Block", - "enc": "Encoder", - "dec": "Decoder", + 'token_embd': 
'Token embedding', + 'pos_embd': 'Position embedding', + 'output_norm': 'Output normalization', + 'output': 'Output', + 'attn_norm': 'Attention normalization', + 'attn_norm_2': 'Attention normalization', + 'attn_qkv': 'Attention query-key-value', + 'attn_q': 'Attention query', + 'attn_k': 'Attention key', + 'attn_v': 'Attention value', + 'attn_output': 'Attention output', + 'ffn_norm': 'Feed-forward network normalization', + 'ffn_up': 'Feed-forward network "up"', + 'ffn_gate': 'Feed-forward network "gate"', + 'ffn_down': 'Feed-forward network "down"', + 'ffn_gate_inp': 'Expert-routing layer for the Feed-forward network in Mixture of Expert models', + 'ffn_gate_exp': 'Feed-forward network "gate" layer per expert in Mixture of Expert models', + 'ffn_down_exp': 'Feed-forward network "down" layer per expert in Mixture of Expert models', + 'ffn_up_exp': 'Feed-forward network "up" layer per expert in Mixture of Expert models', + 'ssm_in': 'State space model input projections', + 'ssm_conv1d': 'State space model rolling/shift', + 'ssm_x': 'State space model selective parametrization', + 'ssm_a': 'State space model state compression', + 'ssm_d': 'State space model skip connection', + 'ssm_dt': 'State space model time step', + 'ssm_out': 'State space model output projection', + 'blk': 'Block', + 'enc': 'Encoder', + 'dec': 'Decoder', } expanded_words = [] @@ -277,42 +222,37 @@ def translate_tensor_name(name): else: expanded_words.append(word.title()) - return " ".join(expanded_words) + return ' '.join(expanded_words) def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: host_endian, file_endian = get_file_host_endian(reader) markdown_content = "" - markdown_content += f"# {args.model} - GGUF Internal File Dump\n\n" - markdown_content += f"- Endian: {file_endian} endian\n" - markdown_content += "\n" - markdown_content += "## Key Value Metadata Store\n\n" - markdown_content += f"There are {len(reader.fields)} key-value pairs in this file\n" - 
markdown_content += "\n" + markdown_content += f'# {args.model} - GGUF Internal File Dump\n\n' + markdown_content += f'- Endian: {file_endian} endian\n' + markdown_content += '\n' + markdown_content += '## Key Value Metadata Store\n\n' + markdown_content += f'There are {len(reader.fields)} key-value pairs in this file\n' + markdown_content += '\n' kv_dump_table: list[dict[str, str | int]] = [] for n, field in enumerate(reader.fields.values(), 1): if not field.types: - pretty_type = "N/A" + pretty_type = 'N/A' elif field.types[0] == GGUFValueType.ARRAY: nest_count = len(field.types) - 1 - pretty_type = ( - "[" * nest_count + str(field.types[-1].name) + "]" * nest_count - ) + pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count else: pretty_type = str(field.types[-1].name) def escape_markdown_inline_code(value_string): # Find the longest contiguous sequence of backticks in the string then # wrap string with appropriate number of backticks required to escape it - max_backticks = max( - (len(match.group(0)) for match in re.finditer(r"`+", value_string)), - default=0, - ) - inline_code_marker = "`" * (max_backticks + 1) + max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0) + inline_code_marker = '`' * (max_backticks + 1) # If the string starts or ends with a backtick, add a space at the beginning and end - if value_string.startswith("`") or value_string.endswith("`"): + if value_string.startswith('`') or value_string.endswith('`'): value_string = f" {value_string} " return f"{inline_code_marker}{value_string}{inline_code_marker}" @@ -323,14 +263,10 @@ def escape_markdown_inline_code(value_string): curr_type = field.types[0] if curr_type == GGUFValueType.STRING: truncate_length = 60 - value_string = str(bytes(field.parts[-1]), encoding="utf-8") + value_string = str(bytes(field.parts[-1]), encoding='utf-8') if len(value_string) > truncate_length: - head = escape_markdown_inline_code( - value_string[: 
truncate_length // 2] - ) - tail = escape_markdown_inline_code( - value_string[-truncate_length // 2 :] - ) + head = escape_markdown_inline_code(value_string[:truncate_length // 2]) + tail = escape_markdown_inline_code(value_string[-truncate_length // 2:]) value = "{head}...{tail}".format(head=head, tail=tail) else: value = escape_markdown_inline_code(value_string) @@ -345,19 +281,10 @@ def escape_markdown_inline_code(value_string): render_element = min(5, total_elements) for element_pos in range(render_element): truncate_length = 30 - value_string = str( - bytes( - field.parts[-1 - (total_elements - element_pos - 1) * 2] - ), - encoding="utf-8", - ) + value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8') if len(value_string) > truncate_length: - head = escape_markdown_inline_code( - value_string[: truncate_length // 2] - ) - tail = escape_markdown_inline_code( - value_string[-truncate_length // 2 :] - ) + head = escape_markdown_inline_code(value_string[:truncate_length // 2]) + tail = escape_markdown_inline_code(value_string[-truncate_length // 2:]) value = "{head}...{tail}".format(head=head, tail=tail) else: value = escape_markdown_inline_code(value_string) @@ -366,33 +293,21 @@ def escape_markdown_inline_code(value_string): elif curr_type in reader.gguf_scalar_to_np: render_element = min(7, total_elements) for element_pos in range(render_element): - array_elements.append( - str(field.parts[-1 - (total_elements - element_pos - 1)][0]) - ) + array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0])) value = f'[ {", ".join(array_elements).strip()}{", ..." 
if total_elements > len(array_elements) else ""} ]' - kv_dump_table.append( - { - "n": n, - "pretty_type": pretty_type, - "total_elements": total_elements, - "field_name": field.name, - "value": value, - } - ) + kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value}) kv_dump_table_header_map = [ - {"key_name": "n", "header_name": "POS", "align": "right"}, - {"key_name": "pretty_type", "header_name": "TYPE", "align": "left"}, - {"key_name": "total_elements", "header_name": "Count", "align": "right"}, - {"key_name": "field_name", "header_name": "Key", "align": "left"}, - {"key_name": "value", "header_name": "Value", "align": "left"}, + {'key_name':'n', 'header_name':'POS', 'align':'right'}, + {'key_name':'pretty_type', 'header_name':'TYPE', 'align':'left'}, + {'key_name':'total_elements', 'header_name':'Count', 'align':'right'}, + {'key_name':'field_name', 'header_name':'Key', 'align':'left'}, + {'key_name':'value', 'header_name':'Value', 'align':'left'}, ] - markdown_content += markdown_table_with_alignment_support( - kv_dump_table_header_map, kv_dump_table - ) + markdown_content += markdown_table_with_alignment_support(kv_dump_table_header_map, kv_dump_table) markdown_content += "\n" @@ -405,17 +320,15 @@ def escape_markdown_inline_code(value_string): # Parsing Tensors Record for key, tensor in enumerate(reader.tensors): - tensor_components = tensor.name.split(".") + tensor_components = tensor.name.split('.') # Classify Tensor Group tensor_group_name = "base" - if tensor_components[0] == "blk": + if tensor_components[0] == 'blk': tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}" - elif ( - tensor_components[0] in ["enc", "dec"] and tensor_components[1] == "blk" - ): + elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk': tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}" - elif tensor_components[0] in ["enc", 
"dec"]: + elif tensor_components[0] in ['enc', 'dec']: tensor_group_name = f"{tensor_components[0]}" # Check if new Tensor Group @@ -428,11 +341,9 @@ def escape_markdown_inline_code(value_string): tensor_name_to_key[tensor.name] = key # Tensors Mapping Dump - markdown_content += f"## Tensors Overview {element_count_rounded_notation(total_elements)} Elements\n\n" - markdown_content += ( - f"Total number of elements in all tensors: {total_elements} Elements\n" - ) - markdown_content += "\n" + markdown_content += f'## Tensors Overview {element_count_rounded_notation(total_elements)} Elements\n\n' + markdown_content += f'Total number of elements in all tensors: {total_elements} Elements\n' + markdown_content += '\n' for group in tensor_prefix_order: tensors = tensor_groups[group] @@ -442,41 +353,24 @@ def escape_markdown_inline_code(value_string): markdown_content += "\n" markdown_content += "### Tensor Data Offset\n" - markdown_content += "\n" - markdown_content += "This table contains the offset and data segment relative to start of file\n" - markdown_content += "\n" + markdown_content += '\n' + markdown_content += 'This table contains the offset and data segment relative to start of file\n' + markdown_content += '\n' tensor_mapping_table: list[dict[str, str | int]] = [] for key, tensor in enumerate(reader.tensors): - data_offset_pretty = "{0:#16x}".format(tensor.data_offset) - data_size_pretty = "{0:#16x}".format(tensor.n_bytes) - tensor_mapping_table.append( - { - "t_id": key, - "layer_name": tensor.name, - "data_offset": data_offset_pretty, - "data_size": data_size_pretty, - } - ) + data_offset_pretty = '{0:#16x}'.format(tensor.data_offset) + data_size_pretty = '{0:#16x}'.format(tensor.n_bytes) + tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty}) tensors_mapping_table_header_map = [ - {"key_name": "t_id", "header_name": "T_ID", "align": "right"}, - { - "key_name": "layer_name", - 
"header_name": "Tensor Layer Name", - "align": "left", - }, - { - "key_name": "data_offset", - "header_name": "Data Offset (B)", - "align": "right", - }, - {"key_name": "data_size", "header_name": "Data Size (B)", "align": "right"}, + {'key_name':'t_id', 'header_name':'T_ID', 'align':'right'}, + {'key_name':'layer_name', 'header_name':'Tensor Layer Name', 'align':'left'}, + {'key_name':'data_offset', 'header_name':'Data Offset (B)', 'align':'right'}, + {'key_name':'data_size', 'header_name':'Data Size (B)', 'align':'right'}, ] - markdown_content += markdown_table_with_alignment_support( - tensors_mapping_table_header_map, tensor_mapping_table - ) + markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table) markdown_content += "\n" for group in tensor_prefix_order: @@ -490,81 +384,35 @@ def escape_markdown_inline_code(value_string): prettify_element_count_size: int = 1 prettify_dimension_max_widths: dict[int, int] = {} for tensor in tensors: - prettify_element_est_count_size = max( - prettify_element_est_count_size, - len(str(element_count_rounded_notation(tensor.n_elements))), - ) - prettify_element_count_size = max( - prettify_element_count_size, len(str(tensor.n_elements)) - ) - for i, dimension_size in enumerate( - list(tensor.shape) + [1] * (4 - len(tensor.shape)) - ): - prettify_dimension_max_widths[i] = max( - prettify_dimension_max_widths.get(i, 1), - len(str(dimension_size)), - ) + prettify_element_est_count_size = max(prettify_element_est_count_size, len(str(element_count_rounded_notation(tensor.n_elements)))) + prettify_element_count_size = max(prettify_element_count_size, len(str(tensor.n_elements))) + for i, dimension_size in enumerate(list(tensor.shape) + [1] * (4 - len(tensor.shape))): + prettify_dimension_max_widths[i] = max(prettify_dimension_max_widths.get(i,1), len(str(dimension_size))) # Generate Tensor Layer Table Content tensor_dump_table: list[dict[str, str | int]] = [] for tensor in 
tensors: - human_friendly_name = translate_tensor_name( - tensor.name.replace(".weight", ".(W)").replace(".bias", ".(B)") - ) - pretty_dimension = " x ".join( - f"{str(d):>{prettify_dimension_max_widths[i]}}" - for i, d in enumerate( - list(tensor.shape) + [1] * (4 - len(tensor.shape)) - ) - ) + human_friendly_name = translate_tensor_name(tensor.name.replace(".weight", ".(W)").replace(".bias", ".(B)")) + pretty_dimension = ' x '.join(f'{str(d):>{prettify_dimension_max_widths[i]}}' for i, d in enumerate(list(tensor.shape) + [1] * (4 - len(tensor.shape)))) element_count_est = f"({element_count_rounded_notation(tensor.n_elements):>{prettify_element_est_count_size}})" element_count_string = f"{element_count_est} {tensor.n_elements:>{prettify_element_count_size}}" type_name_string = f"{tensor.tensor_type.name}" - tensor_dump_table.append( - { - "t_id": tensor_name_to_key[tensor.name], - "layer_name": tensor.name, - "human_layer_name": human_friendly_name, - "element_count": element_count_string, - "pretty_dimension": pretty_dimension, - "tensor_type": type_name_string, - } - ) + tensor_dump_table.append({"t_id":tensor_name_to_key[tensor.name], "layer_name":tensor.name, "human_layer_name":human_friendly_name, "element_count":element_count_string, "pretty_dimension":pretty_dimension, "tensor_type":type_name_string}) tensor_dump_table_header_map = [ - {"key_name": "t_id", "header_name": "T_ID", "align": "right"}, - { - "key_name": "layer_name", - "header_name": "Tensor Layer Name", - "align": "left", - }, - { - "key_name": "human_layer_name", - "header_name": "Human Friendly Tensor Layer Name", - "align": "left", - }, - { - "key_name": "element_count", - "header_name": "Elements", - "align": "left", - }, - { - "key_name": "pretty_dimension", - "header_name": "Shape", - "align": "left", - }, - {"key_name": "tensor_type", "header_name": "Type", "align": "left"}, + {'key_name':'t_id', 'header_name':'T_ID', 'align':'right'}, + {'key_name':'layer_name', 'header_name':'Tensor 
Layer Name', 'align':'left'}, + {'key_name':'human_layer_name', 'header_name':'Human Friendly Tensor Layer Name', 'align':'left'}, + {'key_name':'element_count', 'header_name':'Elements', 'align':'left'}, + {'key_name':'pretty_dimension', 'header_name':'Shape', 'align':'left'}, + {'key_name':'tensor_type', 'header_name':'Type', 'align':'left'}, ] - markdown_content += markdown_table_with_alignment_support( - tensor_dump_table_header_map, tensor_dump_table - ) + markdown_content += markdown_table_with_alignment_support(tensor_dump_table_header_map, tensor_dump_table) markdown_content += "\n" markdown_content += f"- Total elements in {group}: ({element_count_rounded_notation(group_elements):>4}) {group_elements}\n" - markdown_content += ( - f"- Percentage of total elements: {group_percentage:.2f}%\n" - ) + markdown_content += f"- Percentage of total elements: {group_percentage:.2f}%\n" markdown_content += "\n\n" print(markdown_content) # noqa: NP100 @@ -572,44 +420,23 @@ def escape_markdown_inline_code(value_string): def main() -> None: parser = argparse.ArgumentParser(description="Dump GGUF file metadata") - parser.add_argument("model", type=str, help="GGUF format model filename") - parser.add_argument( - "--no-tensors", action="store_true", help="Don't dump tensor metadata" - ) - parser.add_argument("--json", action="store_true", help="Produce JSON output") - parser.add_argument( - "--json-array", - action="store_true", - help="Include full array values in JSON output (long)", - ) - parser.add_argument( - "--data-offset", action="store_true", help="Start of data offset" - ) - parser.add_argument( - "--data-alignment", - action="store_true", - help="Data alignment applied globally to data field", - ) - parser.add_argument( - "--markdown", action="store_true", help="Produce markdown output" - ) - parser.add_argument( - "--verbose", action="store_true", help="increase output verbosity" - ) + parser.add_argument("model", type=str, help="GGUF format model filename") + 
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") + parser.add_argument("--json", action="store_true", help="Produce JSON output") + parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") + parser.add_argument("--data-offset", action="store_true", help="Start of data offset") + parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field") + parser.add_argument("--markdown", action="store_true", help="Produce markdown output") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - if ( - not args.json - and not args.markdown - and not args.data_offset - and not args.data_alignment - ): - logger.info(f"* Loading: {args.model}") + if not args.json and not args.markdown and not args.data_offset and not args.data_alignment: + logger.info(f'* Loading: {args.model}') - reader = GGUFReader(args.model, "r") + reader = GGUFReader(args.model, 'r') if args.json: dump_metadata_json(reader, args) @@ -623,5 +450,5 @@ def main() -> None: dump_metadata(reader, args) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_editor_gui.py b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_editor_gui.py new file mode 100644 index 00000000..9dab6ca2 --- /dev/null +++ b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_editor_gui.py @@ -0,0 +1,1610 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import logging +import argparse +import os +import sys +import numpy +import enum +from pathlib import Path +from typing import Any, Optional, Tuple, Type +import warnings + +import numpy as np +from PySide6.QtWidgets import ( + QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, + QPushButton, QLabel, 
QLineEdit, QFileDialog, QTableWidget, + QTableWidgetItem, QComboBox, QMessageBox, QTabWidget, + QTextEdit, QFormLayout, + QHeaderView, QDialog, QDialogButtonBox +) +from PySide6.QtCore import Qt + +# Necessary to load the local gguf package +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists(): + sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import gguf +from gguf import GGUFReader, GGUFWriter, GGUFValueType, ReaderField +from gguf.constants import TokenType, RopeScalingType, PoolingType, GGMLQuantizationType + +logger = logging.getLogger("gguf-editor-gui") + +# Map of key names to enum types for automatic enum interpretation +KEY_TO_ENUM_TYPE = { + gguf.Keys.Tokenizer.TOKEN_TYPE: TokenType, + gguf.Keys.Rope.SCALING_TYPE: RopeScalingType, + gguf.Keys.LLM.POOLING_TYPE: PoolingType, + gguf.Keys.General.FILE_TYPE: GGMLQuantizationType, +} + +# Define the tokenizer keys that should be edited together +TOKENIZER_LINKED_KEYS = [ + gguf.Keys.Tokenizer.LIST, + gguf.Keys.Tokenizer.TOKEN_TYPE, + gguf.Keys.Tokenizer.SCORES +] + + +class TokenizerEditorDialog(QDialog): + def __init__(self, tokens, token_types, scores, parent=None): + super().__init__(parent) + self.setWindowTitle("Edit Tokenizer Data") + self.resize(900, 600) + + self.tokens = tokens.copy() if tokens else [] + self.token_types = token_types.copy() if token_types else [] + self.scores = scores.copy() if scores else [] + + # Ensure all arrays have the same length + max_len = max(len(self.tokens), len(self.token_types), len(self.scores)) + if len(self.tokens) < max_len: + self.tokens.extend([""] * (max_len - len(self.tokens))) + if len(self.token_types) < max_len: + self.token_types.extend([0] * (max_len - len(self.token_types))) + if len(self.scores) < max_len: + self.scores.extend([0.0] * (max_len - len(self.scores))) + + layout = QVBoxLayout(self) + + # Add filter controls + filter_layout = QHBoxLayout() + 
filter_layout.addWidget(QLabel("Filter:")) + self.filter_edit = QLineEdit() + self.filter_edit.setPlaceholderText("Type to filter tokens...") + self.filter_edit.textChanged.connect(self.apply_filter) + filter_layout.addWidget(self.filter_edit) + + # Add page controls + self.page_size = 100 # Show 100 items per page + self.current_page = 0 + self.total_pages = max(1, (len(self.tokens) + self.page_size - 1) // self.page_size) + + self.page_label = QLabel(f"Page 1 of {self.total_pages}") + filter_layout.addWidget(self.page_label) + + prev_page = QPushButton("Previous") + prev_page.clicked.connect(self.previous_page) + filter_layout.addWidget(prev_page) + + next_page = QPushButton("Next") + next_page.clicked.connect(self.next_page) + filter_layout.addWidget(next_page) + + layout.addLayout(filter_layout) + + # Tokenizer data table + self.tokens_table = QTableWidget() + self.tokens_table.setColumnCount(4) + self.tokens_table.setHorizontalHeaderLabels(["Index", "Token", "Type", "Score"]) + self.tokens_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.ResizeToContents) + self.tokens_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.Stretch) + self.tokens_table.horizontalHeader().setSectionResizeMode(2, QHeaderView.ResizeMode.ResizeToContents) + self.tokens_table.horizontalHeader().setSectionResizeMode(3, QHeaderView.ResizeMode.ResizeToContents) + + layout.addWidget(self.tokens_table) + + # Controls + controls_layout = QHBoxLayout() + + add_button = QPushButton("Add Token") + add_button.clicked.connect(self.add_token) + controls_layout.addWidget(add_button) + + remove_button = QPushButton("Remove Selected") + remove_button.clicked.connect(self.remove_selected) + controls_layout.addWidget(remove_button) + + controls_layout.addStretch() + + layout.addLayout(controls_layout) + + # Buttons + buttons = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel) + buttons.accepted.connect(self.accept) + 
buttons.rejected.connect(self.reject) + layout.addWidget(buttons) + + # Initialize the filtered values + self.filtered_indices = list(range(len(self.tokens))) + + # Load data for the first page + self.load_page() + + def apply_filter(self): + """Filter the tokens based on the search text.""" + filter_text = self.filter_edit.text().lower() + + if not filter_text: + # No filter, show all values + self.filtered_indices = list(range(len(self.tokens))) + else: + # Apply filter + self.filtered_indices = [] + for i, token in enumerate(self.tokens): + if filter_text in str(token).lower(): + self.filtered_indices.append(i) + + # Reset to first page and reload + self.total_pages = max(1, (len(self.filtered_indices) + self.page_size - 1) // self.page_size) + self.current_page = 0 + self.page_label.setText(f"Page 1 of {self.total_pages}") + self.load_page() + + def previous_page(self): + """Go to the previous page of results.""" + if self.current_page > 0: + self.current_page -= 1 + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + self.load_page() + + def next_page(self): + """Go to the next page of results.""" + if self.current_page < self.total_pages - 1: + self.current_page += 1 + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + self.load_page() + + def load_page(self): + """Load the current page of tokenizer data.""" + self.tokens_table.setRowCount(0) # Clear the table + + # Calculate start and end indices for the current page + start_idx = self.current_page * self.page_size + end_idx = min(start_idx + self.page_size, len(self.filtered_indices)) + + # Pre-allocate rows for better performance + self.tokens_table.setRowCount(end_idx - start_idx) + + for row, i in enumerate(range(start_idx, end_idx)): + orig_idx = self.filtered_indices[i] + + # Index + index_item = QTableWidgetItem(str(orig_idx)) + index_item.setData(Qt.ItemDataRole.UserRole, orig_idx) # Store original index + 
index_item.setFlags(index_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.tokens_table.setItem(row, 0, index_item) + + # Token + token_item = QTableWidgetItem(str(self.tokens[orig_idx])) + self.tokens_table.setItem(row, 1, token_item) + + # Token Type + token_type = self.token_types[orig_idx] if orig_idx < len(self.token_types) else 0 + try: + enum_val = TokenType(token_type) + display_text = f"{enum_val.name} ({token_type})" + except (ValueError, KeyError): + display_text = f"Unknown ({token_type})" + + type_item = QTableWidgetItem(display_text) + type_item.setData(Qt.ItemDataRole.UserRole, token_type) + + # Make type cell editable with a double-click handler + type_item.setFlags(type_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.tokens_table.setItem(row, 2, type_item) + + # Score + score = self.scores[orig_idx] if orig_idx < len(self.scores) else 0.0 + score_item = QTableWidgetItem(str(score)) + self.tokens_table.setItem(row, 3, score_item) + + # Connect double-click handler for token type cells + self.tokens_table.cellDoubleClicked.connect(self.handle_cell_double_click) + + def handle_cell_double_click(self, row, column): + """Handle double-click on a cell, specifically for token type editing.""" + if column == 2: # Token Type column + orig_item = self.tokens_table.item(row, 0) + if orig_item: + orig_idx = orig_item.data(Qt.ItemDataRole.UserRole) + self.edit_token_type(row, orig_idx) + + def edit_token_type(self, row, orig_idx): + """Edit a token type using a dialog with a dropdown of all enum options.""" + current_value = self.token_types[orig_idx] if orig_idx < len(self.token_types) else 0 + + # Create a dialog with enum options + dialog = QDialog(self) + dialog.setWindowTitle("Select Token Type") + layout = QVBoxLayout(dialog) + + combo = QComboBox() + for enum_val in TokenType: + combo.addItem(f"{enum_val.name} ({enum_val.value})", enum_val.value) + + # Set current value + try: + if isinstance(current_value, int): + enum_val = 
TokenType(current_value) + combo.setCurrentText(f"{enum_val.name} ({current_value})") + except (ValueError, KeyError): + pass + + layout.addWidget(combo) + + buttons = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel) + buttons.accepted.connect(dialog.accept) + buttons.rejected.connect(dialog.reject) + layout.addWidget(buttons) + + if dialog.exec() == QDialog.DialogCode.Accepted: + # Get the selected value + new_value = combo.currentData() + enum_val = TokenType(new_value) + display_text = f"{enum_val.name} ({new_value})" + + # Update the display + type_item = self.tokens_table.item(row, 2) + if type_item: + type_item.setText(display_text) + type_item.setData(Qt.ItemDataRole.UserRole, new_value) + + # Update the actual value + self.token_types[orig_idx] = new_value + + def add_token(self): + """Add a new token to the end of the list.""" + # Add to the end of the arrays + self.tokens.append("") + self.token_types.append(0) # Default to normal token + self.scores.append(0.0) + + orig_idx = len(self.tokens) - 1 + + # Add to filtered indices if it matches the current filter + filter_text = self.filter_edit.text().lower() + if not filter_text or filter_text in "": + self.filtered_indices.append(orig_idx) + + # Update pagination + self.total_pages = max(1, (len(self.filtered_indices) + self.page_size - 1) // self.page_size) + + # Go to the last page to show the new item + self.current_page = self.total_pages - 1 + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + + # Reload the page + self.load_page() + + def remove_selected(self): + """Remove selected tokens from all arrays.""" + selected_rows = [] + for item in self.tokens_table.selectedItems(): + row = item.row() + if row not in selected_rows: + selected_rows.append(row) + + if not selected_rows: + return + + # Get original indices in descending order to avoid index shifting + orig_indices = [] + for row in selected_rows: + orig_item = 
self.tokens_table.item(row, 0) + if orig_item: + orig_indices.append(orig_item.data(Qt.ItemDataRole.UserRole)) + orig_indices.sort(reverse=True) + + # Remove from all arrays + for idx in orig_indices: + if idx < len(self.tokens): + del self.tokens[idx] + if idx < len(self.token_types): + del self.token_types[idx] + if idx < len(self.scores): + del self.scores[idx] + + # Rebuild filtered_indices + self.filtered_indices = [] + filter_text = self.filter_edit.text().lower() + + for i, token in enumerate(self.tokens): + if not filter_text or filter_text in str(token).lower(): + self.filtered_indices.append(i) + + # Update pagination + self.total_pages = max(1, (len(self.filtered_indices) + self.page_size - 1) // self.page_size) + self.current_page = min(self.current_page, self.total_pages - 1) + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + + # Reload the page + self.load_page() + + def get_data(self): + """Return the edited tokenizer data.""" + return self.tokens, self.token_types, self.scores + + +class ArrayEditorDialog(QDialog): + def __init__(self, array_values, element_type, key=None, parent=None): + super().__init__(parent) + self.setWindowTitle("Edit Array Values") + self.resize(700, 500) + + self.array_values = array_values + self.element_type = element_type + self.key = key + + # Get enum type for this array if applicable + self.enum_type = None + if key in KEY_TO_ENUM_TYPE and element_type == GGUFValueType.INT32: + self.enum_type = KEY_TO_ENUM_TYPE[key] + + layout = QVBoxLayout(self) + + # Add enum type information if applicable + if self.enum_type is not None: + enum_info_layout = QHBoxLayout() + enum_label = QLabel(f"Editing {self.enum_type.__name__} values:") + enum_info_layout.addWidget(enum_label) + + # Add a legend for the enum values + enum_values = ", ".join([f"{e.name}={e.value}" for e in self.enum_type]) + enum_values_label = QLabel(f"Available values: {enum_values}") + enum_values_label.setWordWrap(True) + 
enum_info_layout.addWidget(enum_values_label, 1) + + layout.addLayout(enum_info_layout) + + # Add search/filter controls + filter_layout = QHBoxLayout() + filter_layout.addWidget(QLabel("Filter:")) + self.filter_edit = QLineEdit() + self.filter_edit.setPlaceholderText("Type to filter values...") + self.filter_edit.textChanged.connect(self.apply_filter) + filter_layout.addWidget(self.filter_edit) + + # Add page controls for large arrays + self.page_size = 100 # Show 100 items per page + self.current_page = 0 + self.total_pages = max(1, (len(array_values) + self.page_size - 1) // self.page_size) + + self.page_label = QLabel(f"Page 1 of {self.total_pages}") + filter_layout.addWidget(self.page_label) + + prev_page = QPushButton("Previous") + prev_page.clicked.connect(self.previous_page) + filter_layout.addWidget(prev_page) + + next_page = QPushButton("Next") + next_page.clicked.connect(self.next_page) + filter_layout.addWidget(next_page) + + layout.addLayout(filter_layout) + + # Array items table + self.items_table = QTableWidget() + + # Set up columns based on whether we have an enum type + if self.enum_type is not None: + self.items_table.setColumnCount(3) + self.items_table.setHorizontalHeaderLabels(["Index", "Value", "Actions"]) + self.items_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.ResizeToContents) + self.items_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.Stretch) + self.items_table.horizontalHeader().setSectionResizeMode(2, QHeaderView.ResizeMode.ResizeToContents) + else: + self.items_table.setColumnCount(2) + self.items_table.setHorizontalHeaderLabels(["Index", "Value"]) + self.items_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.ResizeToContents) + self.items_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.Stretch) + + layout.addWidget(self.items_table) + + # Controls + controls_layout = QHBoxLayout() + + add_button = QPushButton("Add Item") + 
add_button.clicked.connect(self.add_item) + controls_layout.addWidget(add_button) + + remove_button = QPushButton("Remove Selected") + remove_button.clicked.connect(self.remove_selected) + controls_layout.addWidget(remove_button) + + # Add bulk edit button for enum arrays + if self.enum_type is not None: + bulk_edit_button = QPushButton("Bulk Edit Selected") + bulk_edit_button.clicked.connect(self.bulk_edit_selected) + controls_layout.addWidget(bulk_edit_button) + + controls_layout.addStretch() + + layout.addLayout(controls_layout) + + # Buttons + buttons = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel) + buttons.accepted.connect(self.accept) + buttons.rejected.connect(self.reject) + layout.addWidget(buttons) + + # Initialize the filtered values + self.filtered_indices = list(range(len(self.array_values))) + + # Load array values for the first page + self.load_page() + + def apply_filter(self): + """Filter the array values based on the search text.""" + filter_text = self.filter_edit.text().lower() + + if not filter_text: + # No filter, show all values + self.filtered_indices = list(range(len(self.array_values))) + else: + # Apply filter + self.filtered_indices = [] + for i, value in enumerate(self.array_values): + # For enum values, search in both name and value + if self.enum_type is not None and isinstance(value, int): + try: + enum_val = self.enum_type(value) + display_text = f"{enum_val.name} ({value})".lower() + if filter_text in display_text: + self.filtered_indices.append(i) + except (ValueError, KeyError): + # If not a valid enum value, just check the raw value + if filter_text in str(value).lower(): + self.filtered_indices.append(i) + else: + # For non-enum values, just check the string representation + if filter_text in str(value).lower(): + self.filtered_indices.append(i) + + # Reset to first page and reload + self.total_pages = max(1, (len(self.filtered_indices) + self.page_size - 1) // self.page_size) + 
self.current_page = 0 + self.page_label.setText(f"Page 1 of {self.total_pages}") + self.load_page() + + def previous_page(self): + """Go to the previous page of results.""" + if self.current_page > 0: + self.current_page -= 1 + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + self.load_page() + + def next_page(self): + """Go to the next page of results.""" + if self.current_page < self.total_pages - 1: + self.current_page += 1 + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + self.load_page() + + def load_page(self): + """Load the current page of array values.""" + self.items_table.setRowCount(0) # Clear the table + + # Calculate start and end indices for the current page + start_idx = self.current_page * self.page_size + end_idx = min(start_idx + self.page_size, len(self.filtered_indices)) + + # Pre-allocate rows for better performance + self.items_table.setRowCount(end_idx - start_idx) + + for row, i in enumerate(range(start_idx, end_idx)): + orig_idx = self.filtered_indices[i] + value = self.array_values[orig_idx] + + # Index + index_item = QTableWidgetItem(str(orig_idx)) + index_item.setData(Qt.ItemDataRole.UserRole, orig_idx) # Store original index + index_item.setFlags(index_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.items_table.setItem(row, 0, index_item) + + # Value + if self.enum_type is not None: + # Display enum value and name + try: + if isinstance(value, (int, numpy.signedinteger)): + enum_val = self.enum_type(value) + display_text = f"{enum_val.name} ({value})" + else: + display_text = str(value) + except (ValueError, KeyError): + display_text = f"Unknown ({value})" + + # Store the enum value in the item + value_item = QTableWidgetItem(display_text) + value_item.setData(Qt.ItemDataRole.UserRole, value) + value_item.setFlags(value_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.items_table.setItem(row, 1, value_item) + + # Add an edit button in a separate column + edit_button = 
QPushButton("Edit") + edit_button.setProperty("row", row) + edit_button.clicked.connect(self.edit_array_enum_value) + + # Create a widget to hold the button + button_widget = QWidget() + button_layout = QHBoxLayout(button_widget) + button_layout.setContentsMargins(2, 2, 2, 2) + button_layout.addWidget(edit_button) + button_layout.addStretch() + + self.items_table.setCellWidget(row, 2, button_widget) + else: + value_item = QTableWidgetItem(str(value)) + self.items_table.setItem(row, 1, value_item) + + def edit_array_enum_value(self): + """Handle editing an enum value in the array editor.""" + button = self.sender() + row = button.property("row") + + # Get the original index from the table item + orig_item = self.items_table.item(row, 0) + new_item = self.items_table.item(row, 1) + if orig_item and new_item and self.enum_type and self.edit_enum_value(row, self.enum_type): + orig_idx = orig_item.data(Qt.ItemDataRole.UserRole) + new_value = new_item.data(Qt.ItemDataRole.UserRole) + # Update the stored value in the array + if isinstance(new_value, (int, float, str, bool)): + self.array_values[orig_idx] = new_value + + def bulk_edit_selected(self): + """Edit multiple enum values at once.""" + if not self.enum_type: + return + + selected_rows = set() + for item in self.items_table.selectedItems(): + selected_rows.add(item.row()) + + if not selected_rows: + QMessageBox.information(self, "No Selection", "Please select at least one row to edit.") + return + + # Create a dialog with enum options + dialog = QDialog(self) + dialog.setWindowTitle(f"Bulk Edit {self.enum_type.__name__} Values") + layout = QVBoxLayout(dialog) + + layout.addWidget(QLabel(f"Set {len(selected_rows)} selected items to:")) + + combo = QComboBox() + for enum_val in self.enum_type: + combo.addItem(f"{enum_val.name} ({enum_val.value})", enum_val.value) + + layout.addWidget(combo) + + buttons = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel) + 
buttons.accepted.connect(dialog.accept) + buttons.rejected.connect(dialog.reject) + layout.addWidget(buttons) + + if dialog.exec() == QDialog.DialogCode.Accepted: + # Get the selected value + new_value = combo.currentData() + enum_val = self.enum_type(new_value) + display_text = f"{enum_val.name} ({new_value})" + + # Update all selected rows + for row in selected_rows: + orig_item = self.items_table.item(row, 0) + new_item = self.items_table.item(row, 1) + if orig_item and new_item: + orig_idx = orig_item.data(Qt.ItemDataRole.UserRole) + self.array_values[orig_idx] = new_value + + # Update the display + new_item.setText(display_text) + new_item.setData(Qt.ItemDataRole.UserRole, new_value) + + def add_item(self): + # Add to the end of the array + orig_idx = len(self.array_values) + + # Add default value based on type + if self.enum_type is not None: + # Default to first enum value + default_value = list(self.enum_type)[0].value + self.array_values.append(default_value) + else: + if self.element_type == GGUFValueType.STRING: + self.array_values.append("") + else: + self.array_values.append(0) + + # Add to filtered indices if it matches the current filter + self.filtered_indices.append(orig_idx) + + # Update pagination + self.total_pages = max(1, (len(self.filtered_indices) + self.page_size - 1) // self.page_size) + + # Go to the last page to show the new item + self.current_page = self.total_pages - 1 + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + + # Reload the page + self.load_page() + + def remove_selected(self): + selected_rows = [] + for item in self.items_table.selectedItems(): + row = item.row() + if row not in selected_rows: + selected_rows.append(row) + + if not selected_rows: + return + + # Get original indices in descending order to avoid index shifting + orig_indices = list() + for row in selected_rows: + orig_item = self.items_table.item(row, 0) + if orig_item: + 
orig_indices.append(orig_item.data(Qt.ItemDataRole.UserRole)) + orig_indices.sort(reverse=True) + + # Remove from array_values + for idx in orig_indices: + del self.array_values[idx] + + # Rebuild filtered_indices + self.filtered_indices = [] + filter_text = self.filter_edit.text().lower() + + for i, value in enumerate(self.array_values): + if not filter_text: + self.filtered_indices.append(i) + else: + # Apply filter + if self.enum_type is not None and isinstance(value, int): + try: + enum_val = self.enum_type(value) + display_text = f"{enum_val.name} ({value})".lower() + if filter_text in display_text: + self.filtered_indices.append(i) + except (ValueError, KeyError): + if filter_text in str(value).lower(): + self.filtered_indices.append(i) + else: + if filter_text in str(value).lower(): + self.filtered_indices.append(i) + + # Update pagination + self.total_pages = max(1, (len(self.filtered_indices) + self.page_size - 1) // self.page_size) + self.current_page = min(self.current_page, self.total_pages - 1) + self.page_label.setText(f"Page {self.current_page + 1} of {self.total_pages}") + + # Reload the page + self.load_page() + + def edit_enum_value(self, row: int, enum_type: Type[enum.Enum]): + """Edit an enum value using a dialog with a dropdown of all enum options.""" + # Get the original index from the table item + orig_item = self.items_table.item(row, 0) + if orig_item: + orig_idx = orig_item.data(Qt.ItemDataRole.UserRole) + else: + return + current_value = self.array_values[orig_idx] + + # Create a dialog with enum options + dialog = QDialog(self) + dialog.setWindowTitle(f"Select {enum_type.__name__} Value") + layout = QVBoxLayout(dialog) + + # Add description + description = QLabel(f"Select a {enum_type.__name__} value:") + layout.addWidget(description) + + # Use a combo box for quick selection + combo = QComboBox() + for enum_val in enum_type: + combo.addItem(f"{enum_val.name} ({enum_val.value})", enum_val.value) + + # Set current value + try: + if 
isinstance(current_value, int): + enum_val = enum_type(current_value) + combo.setCurrentText(f"{enum_val.name} ({current_value})") + except (ValueError, KeyError): + pass + + layout.addWidget(combo) + + buttons = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel) + buttons.accepted.connect(dialog.accept) + buttons.rejected.connect(dialog.reject) + layout.addWidget(buttons) + + if dialog.exec() == QDialog.DialogCode.Accepted: + # Update the value display and stored data + new_value = combo.currentData() + enum_val = enum_type(new_value) + display_text = f"{enum_val.name} ({new_value})" + + new_item = self.items_table.item(row, 1) + if new_item: + new_item.setText(display_text) + new_item.setData(Qt.ItemDataRole.UserRole, new_value) + + # Update the actual array value + self.array_values[orig_idx] = new_value + return True + return False + + def get_array_values(self): + # The array_values list is kept up-to-date as edits are made + return self.array_values + + +class AddMetadataDialog(QDialog): + def __init__(self, parent=None): + super().__init__(parent) + self.setWindowTitle("Add Metadata") + self.resize(400, 200) + + layout = QVBoxLayout(self) + + form_layout = QFormLayout() + + self.key_edit = QLineEdit() + form_layout.addRow("Key:", self.key_edit) + + self.type_combo = QComboBox() + for value_type in GGUFValueType: + if value_type != GGUFValueType.ARRAY: # Skip array type for simplicity + self.type_combo.addItem(value_type.name, value_type) + form_layout.addRow("Type:", self.type_combo) + + self.value_edit = QTextEdit() + form_layout.addRow("Value:", self.value_edit) + + layout.addLayout(form_layout) + + buttons = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel) + buttons.accepted.connect(self.accept) + buttons.rejected.connect(self.reject) + layout.addWidget(buttons) + + def get_data(self) -> Tuple[str, GGUFValueType, Any]: + key = self.key_edit.text() + value_type = 
self.type_combo.currentData() + value_text = self.value_edit.toPlainText() + + # Convert value based on type + if value_type == GGUFValueType.UINT8: + value = np.uint8(int(value_text)) + elif value_type == GGUFValueType.INT8: + value = np.int8(int(value_text)) + elif value_type == GGUFValueType.UINT16: + value = np.uint16(int(value_text)) + elif value_type == GGUFValueType.INT16: + value = np.int16(int(value_text)) + elif value_type == GGUFValueType.UINT32: + value = np.uint32(int(value_text)) + elif value_type == GGUFValueType.INT32: + value = np.int32(int(value_text)) + elif value_type == GGUFValueType.FLOAT32: + value = np.float32(float(value_text)) + elif value_type == GGUFValueType.BOOL: + value = value_text.lower() in ('true', 'yes', '1') + elif value_type == GGUFValueType.STRING: + value = value_text + else: + value = value_text + + return key, value_type, value + + +class GGUFEditorWindow(QMainWindow): + def __init__(self): + super().__init__() + + self.setWindowTitle("GGUF Editor") + self.resize(1000, 800) + + self.current_file = None + self.reader = None + self.modified = False + self.metadata_changes = {} # Store changes to apply when saving + self.metadata_to_remove = set() # Store keys to remove when saving + + self.setup_ui() + + def setup_ui(self): + central_widget = QWidget() + self.setCentralWidget(central_widget) + + main_layout = QVBoxLayout(central_widget) + + # File controls + file_layout = QHBoxLayout() + + self.file_path_edit = QLineEdit() + self.file_path_edit.setReadOnly(True) + file_layout.addWidget(self.file_path_edit) + + open_button = QPushButton("Open GGUF") + open_button.clicked.connect(self.open_file) + file_layout.addWidget(open_button) + + save_button = QPushButton("Save As...") + save_button.clicked.connect(self.save_file) + file_layout.addWidget(save_button) + + main_layout.addLayout(file_layout) + + # Tabs for different views + self.tabs = QTabWidget() + + # Metadata tab + self.metadata_tab = QWidget() + metadata_layout = 
QVBoxLayout(self.metadata_tab) + + # Metadata table + self.metadata_table = QTableWidget() + self.metadata_table.setColumnCount(4) + self.metadata_table.setHorizontalHeaderLabels(["Key", "Type", "Value", "Actions"]) + self.metadata_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch) + self.metadata_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.ResizeToContents) + self.metadata_table.horizontalHeader().setSectionResizeMode(2, QHeaderView.ResizeMode.Stretch) + self.metadata_table.horizontalHeader().setSectionResizeMode(3, QHeaderView.ResizeMode.ResizeToContents) + metadata_layout.addWidget(self.metadata_table) + + # Metadata controls + metadata_controls = QHBoxLayout() + + add_metadata_button = QPushButton("Add Metadata") + add_metadata_button.clicked.connect(self.add_metadata) + metadata_controls.addWidget(add_metadata_button) + + metadata_controls.addStretch() + + metadata_layout.addLayout(metadata_controls) + + # Tensors tab + self.tensors_tab = QWidget() + tensors_layout = QVBoxLayout(self.tensors_tab) + + self.tensors_table = QTableWidget() + self.tensors_table.setColumnCount(5) + self.tensors_table.setHorizontalHeaderLabels(["Name", "Type", "Shape", "Elements", "Size (bytes)"]) + self.tensors_table.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch) + self.tensors_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.ResizeToContents) + self.tensors_table.horizontalHeader().setSectionResizeMode(2, QHeaderView.ResizeMode.ResizeToContents) + self.tensors_table.horizontalHeader().setSectionResizeMode(3, QHeaderView.ResizeMode.ResizeToContents) + self.tensors_table.horizontalHeader().setSectionResizeMode(4, QHeaderView.ResizeMode.ResizeToContents) + tensors_layout.addWidget(self.tensors_table) + + # Add tabs to tab widget + self.tabs.addTab(self.metadata_tab, "Metadata") + self.tabs.addTab(self.tensors_tab, "Tensors") + + main_layout.addWidget(self.tabs) + + # Status bar 
+ self.statusBar().showMessage("Ready") + + def load_file(self, file_path): + """Load a GGUF file by path""" + try: + self.statusBar().showMessage(f"Loading {file_path}...") + QApplication.processEvents() + + self.reader = GGUFReader(file_path, 'r') + self.current_file = file_path + self.file_path_edit.setText(file_path) + + self.load_metadata() + self.load_tensors() + + self.metadata_changes = {} + self.metadata_to_remove = set() + self.modified = False + + self.statusBar().showMessage(f"Loaded {file_path}") + return True + except Exception as e: + QMessageBox.critical(self, "Error", f"Failed to open file: {str(e)}") + self.statusBar().showMessage("Error loading file") + return False + + def open_file(self): + file_path, _ = QFileDialog.getOpenFileName( + self, "Open GGUF File", "", "GGUF Files (*.gguf);;All Files (*)" + ) + + if not file_path: + return + + self.load_file(file_path) + + def load_metadata(self): + self.metadata_table.setRowCount(0) + + if not self.reader: + return + + # Disconnect to prevent triggering during loading + with warnings.catch_warnings(): + warnings.filterwarnings('ignore') + self.metadata_table.itemChanged.disconnect(self.on_metadata_changed) + + for i, (key, field) in enumerate(self.reader.fields.items()): + self.metadata_table.insertRow(i) + + # Key + key_item = QTableWidgetItem(key) + key_item.setFlags(key_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.metadata_table.setItem(i, 0, key_item) + + # Type + if not field.types: + type_str = "N/A" + elif field.types[0] == GGUFValueType.ARRAY: + nest_count = len(field.types) - 1 + element_type = field.types[-1].name + # Check if this is an enum array + enum_type = self.get_enum_for_key(key) + if enum_type is not None and field.types[-1] == GGUFValueType.INT32: + element_type = enum_type.__name__ + type_str = '[' * nest_count + element_type + ']' * nest_count + else: + type_str = str(field.types[0].name) + # Check if this is an enum field + enum_type = self.get_enum_for_key(key) + if 
enum_type is not None and field.types[0] == GGUFValueType.INT32: + type_str = enum_type.__name__ + + type_item = QTableWidgetItem(type_str) + type_item.setFlags(type_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.metadata_table.setItem(i, 1, type_item) + + # Value + value_str = self.format_field_value(field) + value_item = QTableWidgetItem(value_str) + + # Make only simple values editable + if len(field.types) == 1 and field.types[0] != GGUFValueType.ARRAY: + value_item.setFlags(value_item.flags() | Qt.ItemFlag.ItemIsEditable) + else: + value_item.setFlags(value_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + + self.metadata_table.setItem(i, 2, value_item) + + # Actions + actions_widget = QWidget() + actions_layout = QHBoxLayout(actions_widget) + actions_layout.setContentsMargins(2, 2, 2, 2) + + # Add Edit button for arrays and enum fields + if field.types and field.types[0] == GGUFValueType.ARRAY: + edit_button = QPushButton("Edit") + edit_button.setProperty("row", i) + edit_button.setProperty("key", key) + edit_button.clicked.connect(self.edit_array_metadata) + actions_layout.addWidget(edit_button) + + # Add special label for tokenizer linked fields + if key in TOKENIZER_LINKED_KEYS: + edit_button.setText("Edit Tokenizer") + edit_button.setToolTip("Edit all tokenizer data together") + elif len(field.types) == 1 and self.get_enum_for_key(key) is not None: + edit_button = QPushButton("Edit") + edit_button.setProperty("row", i) + edit_button.setProperty("key", key) + edit_button.clicked.connect(self.edit_metadata_enum) + actions_layout.addWidget(edit_button) + + remove_button = QPushButton("Remove") + remove_button.setProperty("row", i) + remove_button.setProperty("key", key) + remove_button.clicked.connect(self.remove_metadata) + actions_layout.addWidget(remove_button) + + self.metadata_table.setCellWidget(i, 3, actions_widget) + + # Reconnect after loading + self.metadata_table.itemChanged.connect(self.on_metadata_changed) + + def extract_array_values(self, 
field: ReaderField) -> list: + """Extract all values from an array field.""" + if not field.types or field.types[0] != GGUFValueType.ARRAY: + return [] + + curr_type = field.types[1] + array_values = [] + total_elements = len(field.data) + + if curr_type == GGUFValueType.STRING: + for element_pos in range(total_elements): + value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8') + array_values.append(value_string) + elif self.reader and curr_type in self.reader.gguf_scalar_to_np: + for element_pos in range(total_elements): + array_values.append(field.parts[-1 - (total_elements - element_pos - 1)][0]) + + return array_values + + def get_enum_for_key(self, key: str) -> Optional[Type[enum.Enum]]: + """Get the enum type for a given key if it exists.""" + return KEY_TO_ENUM_TYPE.get(key) + + def format_enum_value(self, value: Any, enum_type: Type[enum.Enum]) -> str: + """Format a value as an enum if possible.""" + try: + if isinstance(value, (int, str)): + enum_value = enum_type(value) + return f"{enum_value.name} ({value})" + except (ValueError, KeyError): + pass + return str(value) + + def format_field_value(self, field: ReaderField) -> str: + if not field.types: + return "N/A" + + if len(field.types) == 1: + curr_type = field.types[0] + if curr_type == GGUFValueType.STRING: + return str(bytes(field.parts[-1]), encoding='utf-8') + elif self.reader and curr_type in self.reader.gguf_scalar_to_np: + value = field.parts[-1][0] + # Check if this field has an enum type + enum_type = self.get_enum_for_key(field.name) + if enum_type is not None: + return self.format_enum_value(value, enum_type) + return str(value) + + if field.types[0] == GGUFValueType.ARRAY: + array_values = self.extract_array_values(field) + render_element = min(5, len(array_values)) + + # Get enum type for this array if applicable + enum_type = self.get_enum_for_key(field.name) + + if enum_type is not None: + array_elements = [] + for i in 
range(render_element): + array_elements.append(self.format_enum_value(array_values[i], enum_type)) + else: + array_elements = [str(array_values[i]) for i in range(render_element)] + + return f"[ {', '.join(array_elements).strip()}{', ...' if len(array_values) > len(array_elements) else ''} ]" + + return "Complex value" + + def load_tensors(self): + self.tensors_table.setRowCount(0) + + if not self.reader: + return + + for i, tensor in enumerate(self.reader.tensors): + self.tensors_table.insertRow(i) + + # Name + name_item = QTableWidgetItem(tensor.name) + name_item.setFlags(name_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.tensors_table.setItem(i, 0, name_item) + + # Type + type_item = QTableWidgetItem(tensor.tensor_type.name) + type_item.setFlags(type_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.tensors_table.setItem(i, 1, type_item) + + # Shape + shape_str = " × ".join(str(d) for d in tensor.shape) + shape_item = QTableWidgetItem(shape_str) + shape_item.setFlags(shape_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.tensors_table.setItem(i, 2, shape_item) + + # Elements + elements_item = QTableWidgetItem(str(tensor.n_elements)) + elements_item.setFlags(elements_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.tensors_table.setItem(i, 3, elements_item) + + # Size + size_item = QTableWidgetItem(f"{tensor.n_bytes:,}") + size_item.setFlags(size_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.tensors_table.setItem(i, 4, size_item) + + def on_metadata_changed(self, item): + if item.column() != 2: # Only handle value column changes + return + + row = item.row() + orig_item = self.metadata_table.item(row, 0) + key = None + if orig_item: + key = orig_item.text() + new_value = item.text() + + field = None + if self.reader and key: + field = self.reader.get_field(key) + if not field or not field.types or not key: + return + + value_type = field.types[0] + + # Check if this is an enum field + enum_type = self.get_enum_for_key(key) + if enum_type is not 
None and value_type == GGUFValueType.INT32: + # Try to parse the enum value from the text + try: + # Check if it's a name + try: + enum_val = enum_type[new_value] + converted_value = enum_val.value + except (KeyError, AttributeError): + # Check if it's a number or "NAME (value)" format + if '(' in new_value and ')' in new_value: + # Extract the value from "NAME (value)" format + value_part = new_value.split('(')[1].split(')')[0].strip() + converted_value = int(value_part) + else: + # Try to convert directly to int + converted_value = int(new_value) + + # Validate that it's a valid enum value + enum_type(converted_value) + + # Store the change + self.metadata_changes[key] = (value_type, converted_value) + self.modified = True + + # Update display with formatted enum value + formatted_value = self.format_enum_value(converted_value, enum_type) + item.setText(formatted_value) + + self.statusBar().showMessage(f"Changed {key} to {formatted_value}") + return + except (ValueError, KeyError) as e: + QMessageBox.warning( + self, + f"Invalid Enum Value ({e})", + f"'{new_value}' is not a valid {enum_type.__name__} value.\n" + f"Valid values are: {', '.join(v.name for v in enum_type)}") + + # Revert to original value + original_value = self.format_field_value(field) + item.setText(original_value) + return + + try: + # Convert the string value to the appropriate type + if value_type == GGUFValueType.UINT8: + converted_value = np.uint8(int(new_value)) + elif value_type == GGUFValueType.INT8: + converted_value = np.int8(int(new_value)) + elif value_type == GGUFValueType.UINT16: + converted_value = np.uint16(int(new_value)) + elif value_type == GGUFValueType.INT16: + converted_value = np.int16(int(new_value)) + elif value_type == GGUFValueType.UINT32: + converted_value = np.uint32(int(new_value)) + elif value_type == GGUFValueType.INT32: + converted_value = np.int32(int(new_value)) + elif value_type == GGUFValueType.FLOAT32: + converted_value = np.float32(float(new_value)) + elif 
value_type == GGUFValueType.BOOL: + converted_value = new_value.lower() in ('true', 'yes', '1') + elif value_type == GGUFValueType.STRING: + converted_value = new_value + else: + # Unsupported type for editing + return + + # Store the change + self.metadata_changes[key] = (value_type, converted_value) + self.modified = True + + self.statusBar().showMessage(f"Changed {key} to {new_value}") + except ValueError: + QMessageBox.warning(self, "Invalid Value", f"The value '{new_value}' is not valid for type {value_type.name}") + + # Revert to original value + original_value = self.format_field_value(field) + item.setText(original_value) + + def remove_metadata(self): + button = self.sender() + key = button.property("key") + row = button.property("row") + + reply = QMessageBox.question( + self, "Confirm Removal", + f"Are you sure you want to remove the metadata key '{key}'?", + QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No, QMessageBox.StandardButton.No + ) + + if reply == QMessageBox.StandardButton.Yes: + self.metadata_table.removeRow(row) + self.metadata_to_remove.add(key) + + # If we previously had changes for this key, remove them + if key in self.metadata_changes: + del self.metadata_changes[key] + + self.modified = True + self.statusBar().showMessage(f"Marked {key} for removal") + + def edit_metadata_enum(self): + """Edit an enum metadata field.""" + button = self.sender() + key = button.property("key") + row = button.property("row") + + field = None + if self.reader: + field = self.reader.get_field(key) + if not field or not field.types: + return + + enum_type = self.get_enum_for_key(key) + if enum_type is None: + return + + # Get current value + current_value = field.contents() + + # Create a dialog with enum options + dialog = QDialog(self) + dialog.setWindowTitle(f"Select {enum_type.__name__} Value") + layout = QVBoxLayout(dialog) + + combo = QComboBox() + for enum_val in enum_type: + combo.addItem(f"{enum_val.name} ({enum_val.value})", 
enum_val.value) + + # Set current value + try: + if isinstance(current_value, (int, str)): + enum_val = enum_type(current_value) + combo.setCurrentText(f"{enum_val.name} ({current_value})") + except (ValueError, KeyError): + pass + + layout.addWidget(combo) + + buttons = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel) + buttons.accepted.connect(dialog.accept) + buttons.rejected.connect(dialog.reject) + layout.addWidget(buttons) + + if dialog.exec() == QDialog.DialogCode.Accepted: + # Get the selected value + new_value = combo.currentData() + enum_val = enum_type(new_value) + + # Store the change + self.metadata_changes[key] = (field.types[0], new_value) + self.modified = True + + # Update display + display_text = f"{enum_val.name} ({new_value})" + target_item = self.metadata_table.item(row, 2) + if target_item: + target_item.setText(display_text) + + self.statusBar().showMessage(f"Changed {key} to {display_text}") + + def edit_array_metadata(self): + button = self.sender() + key = button.property("key") + row = button.property("row") + + # Check if this is one of the linked tokenizer keys + if key in TOKENIZER_LINKED_KEYS: + self.edit_tokenizer_metadata(key) + return + + field = None + if self.reader: + field = self.reader.get_field(key) + if not field or not field.types or field.types[0] != GGUFValueType.ARRAY: + return + + # Get array element type + element_type = field.types[1] + + # Extract array values + array_values = self.extract_array_values(field) + + # Open array editor dialog + dialog = ArrayEditorDialog(array_values, element_type, key, self) + if dialog.exec() == QDialog.DialogCode.Accepted: + new_values = dialog.get_array_values() + + # Store the change + self.metadata_changes[key] = (GGUFValueType.ARRAY, (element_type, new_values)) + self.modified = True + + # Update display + enum_type = self.get_enum_for_key(key) + if enum_type is not None and element_type == GGUFValueType.INT32: + value_str = f"[ {', 
'.join(self.format_enum_value(v, enum_type) for v in new_values[:5])}{', ...' if len(new_values) > 5 else ''} ]" + else: + value_str = f"[ {', '.join(str(v) for v in new_values[:5])}{', ...' if len(new_values) > 5 else ''} ]" + target_item = self.metadata_table.item(row, 2) + if target_item: + target_item.setText(value_str) + + self.statusBar().showMessage(f"Updated array values for {key}") + + def edit_tokenizer_metadata(self, trigger_key): + """Edit the linked tokenizer metadata arrays together.""" + if not self.reader: + return + + # Get all three fields + tokens_field = self.reader.get_field(gguf.Keys.Tokenizer.LIST) + token_types_field = self.reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + scores_field = self.reader.get_field(gguf.Keys.Tokenizer.SCORES) + + # Extract values from each field + tokens = self.extract_array_values(tokens_field) if tokens_field else [] + token_types = self.extract_array_values(token_types_field) if token_types_field else [] + scores = self.extract_array_values(scores_field) if scores_field else [] + + # Apply any pending changes + if gguf.Keys.Tokenizer.LIST in self.metadata_changes: + _, (_, tokens) = self.metadata_changes[gguf.Keys.Tokenizer.LIST] + if gguf.Keys.Tokenizer.TOKEN_TYPE in self.metadata_changes: + _, (_, token_types) = self.metadata_changes[gguf.Keys.Tokenizer.TOKEN_TYPE] + if gguf.Keys.Tokenizer.SCORES in self.metadata_changes: + _, (_, scores) = self.metadata_changes[gguf.Keys.Tokenizer.SCORES] + + # Open the tokenizer editor dialog + dialog = TokenizerEditorDialog(tokens, token_types, scores, self) + if dialog.exec() == QDialog.DialogCode.Accepted: + new_tokens, new_token_types, new_scores = dialog.get_data() + + # Store changes for all three arrays + if tokens_field: + self.metadata_changes[gguf.Keys.Tokenizer.LIST] = ( + GGUFValueType.ARRAY, + (tokens_field.types[1], new_tokens) + ) + + if token_types_field: + self.metadata_changes[gguf.Keys.Tokenizer.TOKEN_TYPE] = ( + GGUFValueType.ARRAY, + 
(token_types_field.types[1], new_token_types) + ) + + if scores_field: + self.metadata_changes[gguf.Keys.Tokenizer.SCORES] = ( + GGUFValueType.ARRAY, + (scores_field.types[1], new_scores) + ) + + self.modified = True + + # Update display for all three fields + self.update_tokenizer_display(gguf.Keys.Tokenizer.LIST, new_tokens) + self.update_tokenizer_display(gguf.Keys.Tokenizer.TOKEN_TYPE, new_token_types) + self.update_tokenizer_display(gguf.Keys.Tokenizer.SCORES, new_scores) + + self.statusBar().showMessage("Updated tokenizer data") + + def update_tokenizer_display(self, key, values): + """Update the display of a tokenizer field in the metadata table.""" + for row in range(self.metadata_table.rowCount()): + key_item = self.metadata_table.item(row, 0) + if key_item and key_item.text() == key: + value_str = f"[ {', '.join(str(v) for v in values[:5])}{', ...' if len(values) > 5 else ''} ]" + value_item = self.metadata_table.item(row, 2) + if value_item: + value_item.setText(value_str) + break + + def add_metadata(self): + dialog = AddMetadataDialog(self) + if dialog.exec() == QDialog.DialogCode.Accepted: + key, value_type, value = dialog.get_data() + + if not key: + QMessageBox.warning(self, "Invalid Key", "Key cannot be empty") + return + + # Check if key already exists + for row in range(self.metadata_table.rowCount()): + orig_item = self.metadata_table.item(row, 0) + if orig_item and orig_item.text() == key: + QMessageBox.warning(self, "Duplicate Key", f"Key '{key}' already exists") + return + + # Add to table + row = self.metadata_table.rowCount() + self.metadata_table.insertRow(row) + + # Key + key_item = QTableWidgetItem(key) + key_item.setFlags(key_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.metadata_table.setItem(row, 0, key_item) + + # Type + type_item = QTableWidgetItem(value_type.name) + type_item.setFlags(type_item.flags() & ~Qt.ItemFlag.ItemIsEditable) + self.metadata_table.setItem(row, 1, type_item) + + # Value + value_item = 
QTableWidgetItem(str(value)) + value_item.setFlags(value_item.flags() | Qt.ItemFlag.ItemIsEditable) + self.metadata_table.setItem(row, 2, value_item) + + # Actions + actions_widget = QWidget() + actions_layout = QHBoxLayout(actions_widget) + actions_layout.setContentsMargins(2, 2, 2, 2) + + remove_button = QPushButton("Remove") + remove_button.setProperty("row", row) + remove_button.setProperty("key", key) + remove_button.clicked.connect(self.remove_metadata) + actions_layout.addWidget(remove_button) + + self.metadata_table.setCellWidget(row, 3, actions_widget) + + # Store the change + self.metadata_changes[key] = (value_type, value) + self.modified = True + + self.statusBar().showMessage(f"Added new metadata key {key}") + + def save_file(self): + if not self.reader: + QMessageBox.warning(self, "No File Open", "Please open a GGUF file first") + return + + if not self.modified and not self.metadata_changes and not self.metadata_to_remove: + QMessageBox.information(self, "No Changes", "No changes to save") + return + + file_path, _ = QFileDialog.getSaveFileName( + self, "Save GGUF File As", "", "GGUF Files (*.gguf);;All Files (*)" + ) + + if not file_path: + return + + try: + self.statusBar().showMessage(f"Saving to {file_path}...") + QApplication.processEvents() + + # Get architecture and endianness from the original file + arch = 'unknown' + field = self.reader.get_field(gguf.Keys.General.ARCHITECTURE) + if field: + arch = field.contents() + + # Create writer + writer = GGUFWriter(file_path, arch=arch, endianess=self.reader.endianess) + + # Get alignment if present + alignment = None + field = self.reader.get_field(gguf.Keys.General.ALIGNMENT) + if field: + alignment = field.contents() + if alignment is not None: + writer.data_alignment = alignment + + # Copy metadata with changes + for field in self.reader.fields.values(): + # Skip virtual fields and fields written by GGUFWriter + if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'): + 
continue + + # Skip fields marked for removal + if field.name in self.metadata_to_remove: + continue + + # Apply changes if any + if field.name in self.metadata_changes: + value_type, value = self.metadata_changes[field.name] + if value_type == GGUFValueType.ARRAY: + # Handle array values + element_type, array_values = value + writer.add_array(field.name, array_values) + else: + writer.add_key_value(field.name, value, value_type) + else: + # Copy original value + value = field.contents() + if value is not None and field.types: + writer.add_key_value(field.name, value, field.types[0]) + + # Add new metadata + for key, (value_type, value) in self.metadata_changes.items(): + # Skip if the key already existed (we handled it above) + if self.reader.get_field(key) is not None: + continue + + writer.add_key_value(key, value, value_type) + + # Add tensors (including data) + for tensor in self.reader.tensors: + writer.add_tensor(tensor.name, tensor.data, raw_shape=tensor.data.shape, raw_dtype=tensor.tensor_type) + + # Write header and metadata + writer.open_output_file(Path(file_path)) + writer.write_header_to_file() + writer.write_kv_data_to_file() + + # Write tensor data using the optimized method + writer.write_tensors_to_file(progress=False) + + writer.close() + + self.statusBar().showMessage(f"Saved to {file_path}") + + # Ask if user wants to open the new file + reply = QMessageBox.question( + self, "Open Saved File", + "Would you like to open the newly saved file?", + QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No, QMessageBox.StandardButton.Yes + ) + + if reply == QMessageBox.StandardButton.Yes: + self.reader = GGUFReader(file_path, 'r') + self.current_file = file_path + self.file_path_edit.setText(file_path) + + self.load_metadata() + self.load_tensors() + + self.metadata_changes = {} + self.metadata_to_remove = set() + self.modified = False + + except Exception as e: + QMessageBox.critical(self, "Error", f"Failed to save file: {str(e)}") + 
self.statusBar().showMessage("Error saving file") + + +def main() -> None: + parser = argparse.ArgumentParser(description="GUI GGUF Editor") + parser.add_argument("model_path", nargs="?", help="path to GGUF model file to load at startup") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + app = QApplication(sys.argv) + window = GGUFEditorWindow() + window.show() + + # Load model if specified + if args.model_path: + if os.path.isfile(args.model_path) and args.model_path.endswith('.gguf'): + window.load_file(args.model_path) + else: + logger.error(f"Invalid model path: {args.model_path}") + QMessageBox.warning( + window, + "Invalid Model Path", + f"The specified file does not exist or is not a GGUF file: {args.model_path}") + + sys.exit(app.exec()) + + +if __name__ == '__main__': + main() diff --git a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_hash.py b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_hash.py index a39d77a3..3ef98992 100755 --- a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_hash.py +++ b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_hash.py @@ -13,10 +13,7 @@ from tqdm import tqdm # Necessary to load the local gguf package -if ( - "NO_LOCAL_GGUF" not in os.environ - and (Path(__file__).parent.parent.parent.parent / "gguf-py").exists() -): +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from gguf import GGUFReader # noqa: E402 @@ -25,14 +22,12 @@ logger = logging.getLogger("gguf-hash") # UUID_NAMESPACE_LLAMA_CPP = uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp') -UUID_NAMESPACE_LLAMA_CPP = uuid.UUID("ef001206-dadc-5f6d-a15f-3359e577d4e5") +UUID_NAMESPACE_LLAMA_CPP = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5') # For more information about what field.parts and field.data represent, # 
please see the comments in the modify_gguf.py example. -def gguf_hash( - reader: GGUFReader, filename: str, disable_progress_bar: bool, no_layer: bool -) -> None: +def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar: bool, no_layer: bool) -> None: sha1 = hashlib.sha1() sha256 = hashlib.sha256() uuidv5_sha1 = hashlib.sha1() @@ -41,10 +36,9 @@ def gguf_hash( # Total Weight Calculation For Progress Bar total_weights = 0 for n, tensor in enumerate(reader.tensors, 1): + # We don't need these - if tensor.name.endswith( - (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") - ): + if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue # Calculate Tensor Volume @@ -54,20 +48,13 @@ def gguf_hash( total_weights += sum_weights_in_tensor # Hash Progress Bar - bar = tqdm( - desc="Hashing", - total=total_weights, - unit="weights", - unit_scale=True, - disable=disable_progress_bar, - ) + bar = tqdm(desc="Hashing", total=total_weights, unit="weights", unit_scale=True, disable=disable_progress_bar) # Hashing Process for tensor in reader.tensors: + # We don't need these - if tensor.name.endswith( - (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") - ): + if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue # Progressbar @@ -77,21 +64,14 @@ def gguf_hash( bar.update(sum_weights_in_tensor) if not no_layer: + sha1_layer = hashlib.sha1() sha1_layer.update(tensor.data.data) - print( - "sha1 {0} {1}:{2}".format( - sha1_layer.hexdigest(), filename, tensor.name - ) - ) # noqa: NP100 + print("sha1 {0} {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100 sha256_layer = hashlib.sha256() sha256_layer.update(tensor.data.data) - print( - "sha256 {0} {1}:{2}".format( - sha256_layer.hexdigest(), filename, tensor.name - ) - ) # noqa: NP100 + print("sha256 {0} {1}:{2}".format(sha256_layer.hexdigest(), filename, 
tensor.name)) # noqa: NP100 sha1.update(tensor.data.data) sha256.update(tensor.data.data) @@ -101,30 +81,22 @@ def gguf_hash( bar.close() # Display Hash Output - print("sha1 {0} {1}".format(sha1.hexdigest(), filename)) # noqa: NP100 - print("sha256 {0} {1}".format(sha256.hexdigest(), filename)) # noqa: NP100 - print( - "uuid {0} {1}".format( - uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename - ) - ) # noqa: NP100 + print("sha1 {0} {1}".format(sha1.hexdigest(), filename)) # noqa: NP100 + print("sha256 {0} {1}".format(sha256.hexdigest(), filename)) # noqa: NP100 + print("uuid {0} {1}".format(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename)) # noqa: NP100 def main() -> None: parser = argparse.ArgumentParser(description="Dump GGUF file metadata") - parser.add_argument("model", type=str, help="GGUF format model filename") - parser.add_argument( - "--no-layer", action="store_true", help="exclude per layer hash" - ) - parser.add_argument( - "--verbose", action="store_true", help="increase output verbosity" - ) + parser.add_argument("model", type=str, help="GGUF format model filename") + parser.add_argument("--no-layer", action="store_true", help="exclude per layer hash") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") parser.add_argument("--progressbar", action="store_true", help="enable progressbar") args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - reader = GGUFReader(args.model, "r") + reader = GGUFReader(args.model, 'r') gguf_hash(reader, args.model, not args.progressbar, args.no_layer) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_new_metadata.py b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_new_metadata.py index d534e712..7aff6c92 100755 --- a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_new_metadata.py +++ 
b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_new_metadata.py @@ -8,15 +8,11 @@ import json from pathlib import Path -import numpy as np from tqdm import tqdm from typing import Any, Sequence, NamedTuple # Necessary to load the local gguf package -if ( - "NO_LOCAL_GGUF" not in os.environ - and (Path(__file__).parent.parent.parent.parent / "gguf-py").exists() -): +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent.parent)) import gguf @@ -27,50 +23,13 @@ class MetadataDetails(NamedTuple): type: gguf.GGUFValueType value: Any - description: str = "" - - -def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian: - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # Host is little endian - host_endian = gguf.GGUFEndian.LITTLE - swapped_endian = gguf.GGUFEndian.BIG - else: - # Sorry PDP or other weird systems that don't use BE or LE. - host_endian = gguf.GGUFEndian.BIG - swapped_endian = gguf.GGUFEndian.LITTLE - - if reader.byte_order == "S": - return swapped_endian - else: - return host_endian - - -def decode_field(field: gguf.ReaderField | None) -> Any: - if field and field.types: - main_type = field.types[0] - - if main_type == gguf.GGUFValueType.ARRAY: - sub_type = field.types[-1] - - if sub_type == gguf.GGUFValueType.STRING: - return [ - str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data - ] - else: - return [pv for idx in field.data for pv in field.parts[idx].tolist()] - if main_type == gguf.GGUFValueType.STRING: - return str(bytes(field.parts[-1]), encoding="utf-8") - else: - return field.parts[-1][0] - - return None + description: str = '' def get_field_data(reader: gguf.GGUFReader, key: str) -> Any: field = reader.get_field(key) - return decode_field(field) + return field.contents() if field else None def find_token(token_list: Sequence[int], token: str) -> Sequence[int]: @@ -82,48 +41,36 @@ def find_token(token_list: Sequence[int], 
token: str) -> Sequence[int]: return token_ids -def copy_with_new_metadata( - reader: gguf.GGUFReader, - writer: gguf.GGUFWriter, - new_metadata: dict[str, MetadataDetails], - remove_metadata: Sequence[str], -) -> None: +def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None: for field in reader.fields.values(): # Suppress virtual fields and fields written by GGUFWriter - if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith( - "GGUF." - ): - logger.debug(f"Suppressing {field.name}") + if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'): + logger.debug(f'Suppressing {field.name}') continue # Skip old chat templates if we have new ones - if ( - field.name.startswith(gguf.Keys.Tokenizer.CHAT_TEMPLATE) - and gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata - ): - logger.debug(f"Skipping {field.name}") + if field.name.startswith(gguf.Keys.Tokenizer.CHAT_TEMPLATE) and gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: + logger.debug(f'Skipping {field.name}') continue if field.name in remove_metadata: - logger.debug(f"Removing {field.name}") + logger.debug(f'Removing {field.name}') continue - old_val = MetadataDetails(field.types[0], decode_field(field)) + old_val = MetadataDetails(field.types[0], field.contents()) val = new_metadata.get(field.name, old_val) if field.name in new_metadata: - logger.debug( - f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}' - ) + logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}') del new_metadata[field.name] elif val.value is not None: - logger.debug(f"Copying {field.name}") + logger.debug(f'Copying {field.name}') if val.value is not None: writer.add_key_value(field.name, val.value, val.type) if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: - logger.debug("Adding chat template(s)") + logger.debug('Adding chat 
template(s)') writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value) del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] @@ -135,13 +82,7 @@ def copy_with_new_metadata( for tensor in reader.tensors: total_bytes += tensor.n_bytes - writer.add_tensor_info( - tensor.name, - tensor.data.shape, - tensor.data.dtype, - tensor.data.nbytes, - tensor.tensor_type, - ) + writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type) bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) @@ -157,78 +98,22 @@ def copy_with_new_metadata( def main() -> None: - tokenizer_metadata = ( - getattr(gguf.Keys.Tokenizer, n) - for n in gguf.Keys.Tokenizer.__dict__.keys() - if not n.startswith("_") - ) - token_names = dict( - (n.split(".")[-1][: -len("_token_id")], n) - for n in tokenizer_metadata - if n.endswith("_token_id") - ) - - parser = argparse.ArgumentParser( - description="Make a copy of a GGUF file with new metadata" - ) - parser.add_argument("input", type=Path, help="GGUF format model input filename") - parser.add_argument("output", type=Path, help="GGUF format model output filename") - parser.add_argument( - "--general-name", type=str, help="The models general.name", metavar='"name"' - ) - parser.add_argument( - "--general-description", - type=str, - help="The models general.description", - metavar='"Description ..."', - ) - parser.add_argument( - "--chat-template", - type=str, - help="Chat template string (or JSON string containing templates)", - metavar='"{% ... 
%} ..."', - ) - parser.add_argument( - "--chat-template-config", - type=Path, - help="Config file containing chat template(s)", - metavar="tokenizer_config.json", - ) - parser.add_argument( - "--pre-tokenizer", - type=str, - help="The models tokenizer.ggml.pre", - metavar='"pre tokenizer"', - ) - parser.add_argument( - "--remove-metadata", - action="append", - type=str, - help="Remove metadata (by key name) from output model", - metavar="general.url", - ) - parser.add_argument( - "--special-token", - action="append", - type=str, - help="Special token by value", - nargs=2, - metavar=(" | ".join(token_names.keys()), '""'), - ) - parser.add_argument( - "--special-token-by-id", - action="append", - type=str, - help="Special token by id", - nargs=2, - metavar=(" | ".join(token_names.keys()), "0"), - ) - parser.add_argument( - "--force", action="store_true", help="Bypass warnings without confirmation" - ) - parser.add_argument( - "--verbose", action="store_true", help="Increase output verbosity" - ) + tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_')) + token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id')) + + parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata") + parser.add_argument("input", type=Path, help="GGUF format model input filename") + parser.add_argument("output", type=Path, help="GGUF format model output filename") + parser.add_argument("--general-name", type=str, help="The models general.name", metavar='"name"') + parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."') + parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... 
%} ..."') + parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json') + parser.add_argument("--pre-tokenizer", type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"') + parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url') + parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '""')) + parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0')) + parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation") + parser.add_argument("--verbose", action="store_true", help="Increase output verbosity") args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"]) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) @@ -237,58 +122,40 @@ def main() -> None: remove_metadata = args.remove_metadata or [] if args.general_name: - new_metadata[gguf.Keys.General.NAME] = MetadataDetails( - gguf.GGUFValueType.STRING, args.general_name - ) + new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name) if args.general_description: - new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails( - gguf.GGUFValueType.STRING, args.general_description - ) + new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description) if args.chat_template: - new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails( - gguf.GGUFValueType.STRING, - json.loads(args.chat_template) - if args.chat_template.startswith("[") - else args.chat_template, - ) + new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, 
json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template) if args.chat_template_config: - with open(args.chat_template_config, "r") as fp: + with open(args.chat_template_config, 'r') as fp: config = json.load(fp) - template = config.get("chat_template") + template = config.get('chat_template') if template: - new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails( - gguf.GGUFValueType.STRING, template - ) + new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template) if args.pre_tokenizer: - new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails( - gguf.GGUFValueType.STRING, args.pre_tokenizer - ) + new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer) if remove_metadata: - logger.warning("*** Warning *** Warning *** Warning **") - logger.warning("* Most metadata is required for a fully functional GGUF file,") - logger.warning( - "* removing crucial metadata may result in a corrupt output file!" - ) + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning('* Most metadata is required for a fully functional GGUF file,') + logger.warning('* removing crucial metadata may result in a corrupt output file!') if not args.force: - logger.warning( - "* Enter exactly YES if you are positive you want to proceed:" - ) - response = input("YES, I am sure> ") - if response != "YES": + logger.warning('* Enter exactly YES if you are positive you want to proceed:') + response = input('YES, I am sure> ') + if response != 'YES': logger.info("You didn't enter YES. 
Okay then, see ya!") sys.exit(0) - logger.info(f"* Loading: {args.input}") - reader = gguf.GGUFReader(args.input, "r") + logger.info(f'* Loading: {args.input}') + reader = gguf.GGUFReader(args.input, 'r') arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE) - endianess = get_byteorder(reader) token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or [] @@ -297,15 +164,11 @@ def main() -> None: logger.warning(f'Unknown special token "{name}", ignoring...') else: ids = find_token(token_list, token) - new_metadata[token_names[name]] = MetadataDetails( - gguf.GGUFValueType.UINT32, ids[0], f"= {token}" - ) + new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}') if len(ids) > 1: - logger.warning( - f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:' - ) - logger.warning(", ".join(str(i) for i in ids)) + logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:') + logger.warning(', '.join(str(i) for i in ids)) for name, id_string in args.special_token_by_id or []: if name not in token_names: @@ -316,33 +179,29 @@ def main() -> None: id_int = int(id_string) if id_int >= 0 and id_int < len(token_list): - new_metadata[token_names[name]] = MetadataDetails( - gguf.GGUFValueType.UINT32, id_int, f"= {token_list[id_int]}" - ) + new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}') else: - raise LookupError(f"Token ID {id_int} is not within token list!") + raise LookupError(f'Token ID {id_int} is not within token list!') if os.path.isfile(args.output) and not args.force: - logger.warning("*** Warning *** Warning *** Warning **") - logger.warning( - f'* The "{args.output}" GGUF file already exists, it will be overwritten!' 
- ) - logger.warning("* Enter exactly YES if you are positive you want to proceed:") - response = input("YES, I am sure> ") - if response != "YES": + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!') + logger.warning('* Enter exactly YES if you are positive you want to proceed:') + response = input('YES, I am sure> ') + if response != 'YES': logger.info("You didn't enter YES. Okay then, see ya!") sys.exit(0) - logger.info(f"* Writing: {args.output}") - writer = gguf.GGUFWriter(args.output, arch=arch, endianess=endianess) + logger.info(f'* Writing: {args.output}') + writer = gguf.GGUFWriter(args.output, arch=arch, endianess=reader.endianess) alignment = get_field_data(reader, gguf.Keys.General.ALIGNMENT) if alignment is not None: - logger.debug(f"Setting custom alignment: {alignment}") + logger.debug(f'Setting custom alignment: {alignment}') writer.data_alignment = alignment copy_with_new_metadata(reader, writer, new_metadata, remove_metadata) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_set_metadata.py b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_set_metadata.py index f613ffca..f5809c35 100755 --- a/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_set_metadata.py +++ b/lpm_kernel/L2/gguf-py/gguf/scripts/gguf_set_metadata.py @@ -6,10 +6,7 @@ from pathlib import Path # Necessary to load the local gguf package -if ( - "NO_LOCAL_GGUF" not in os.environ - and (Path(__file__).parent.parent.parent.parent / "gguf-py").exists() -): +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from gguf import GGUFReader # noqa: E402 @@ -18,8 +15,8 @@ def minimal_example(filename: str) -> None: - reader = GGUFReader(filename, "r+") - field = reader.fields["tokenizer.ggml.bos_token_id"] + reader = 
GGUFReader(filename, 'r+') + field = reader.fields['tokenizer.ggml.bos_token_id'] if field is None: return part_index = field.data[0] @@ -47,68 +44,52 @@ def minimal_example(filename: str) -> None: def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: field = reader.get_field(args.key) if field is None: - logger.error(f"! Field {repr(args.key)} not found") + logger.error(f'! Field {repr(args.key)} not found') sys.exit(1) # Note that field.types is a list of types. This is because the GGUF # format supports arrays. For example, an array of UINT32 would # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32] handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None if handler is None: - logger.error( - f"! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}" - ) + logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}') sys.exit(1) current_value = field.parts[field.data[0]][0] new_value = handler(args.value) - logger.info( - f"* Preparing to change field {repr(args.key)} from {current_value} to {new_value}" - ) + logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') if current_value == new_value: - logger.info( - f"- Key {repr(args.key)} already set to requested value {current_value}" - ) + logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}') sys.exit(0) if args.dry_run: sys.exit(0) if not args.force: - logger.warning("*** Warning *** Warning *** Warning **") - logger.warning( - "* Changing fields in a GGUF file can make it unusable. Proceed at your own risk." - ) - logger.warning("* Enter exactly YES if you are positive you want to proceed:") - response = input("YES, I am sure> ") - if response != "YES": + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning('* Changing fields in a GGUF file can make it unusable. 
Proceed at your own risk.') + logger.warning('* Enter exactly YES if you are positive you want to proceed:') + response = input('YES, I am sure> ') + if response != 'YES': logger.info("You didn't enter YES. Okay then, see ya!") sys.exit(0) field.parts[field.data[0]][0] = new_value - logger.info("* Field changed. Successful completion.") + logger.info('* Field changed. Successful completion.') def main() -> None: - parser = argparse.ArgumentParser( - description="Set a simple value in GGUF file metadata" - ) - parser.add_argument("model", type=str, help="GGUF format model filename") - parser.add_argument("key", type=str, help="Metadata key to set") - parser.add_argument("value", type=str, help="Metadata value to set") - parser.add_argument( - "--dry-run", action="store_true", help="Don't actually change anything" - ) - parser.add_argument( - "--force", action="store_true", help="Change the field without confirmation" - ) - parser.add_argument( - "--verbose", action="store_true", help="increase output verbosity" - ) + parser = argparse.ArgumentParser(description="Set a simple value in GGUF file metadata") + parser.add_argument("model", type=str, help="GGUF format model filename") + parser.add_argument("key", type=str, help="Metadata key to set") + parser.add_argument("value", type=str, help="Metadata value to set") + parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything") + parser.add_argument("--force", action="store_true", help="Change the field without confirmation") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - logger.info(f"* Loading: {args.model}") - reader = GGUFReader(args.model, "r" if args.dry_run else "r+") + logger.info(f'* Loading: {args.model}') + reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+') set_metadata(reader, 
args) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/lpm_kernel/L2/gguf-py/gguf/tensor_mapping.py b/lpm_kernel/L2/gguf-py/gguf/tensor_mapping.py index a82f1a5c..311d1ff6 100644 --- a/lpm_kernel/L2/gguf-py/gguf/tensor_mapping.py +++ b/lpm_kernel/L2/gguf-py/gguf/tensor_mapping.py @@ -9,671 +9,1060 @@ class TensorNameMap: mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( - "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone - "transformer.word_embeddings", # falcon - "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 - "tok_embeddings", # llama-pth - "embeddings.word_embeddings", # bert nomic-bert + "gpt_neox.embed_in", # gptneox + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone + "transformer.word_embeddings", # falcon + "word_embeddings", # bloom + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 + "tok_embeddings", # llama-pth + "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon - "wte", # gpt2 - "transformer.embd.wte", # phi2 - "model.tok_embeddings", # internlm2 - "model.embedding", # mamba-qbert - "backbone.embedding", # mamba - "backbone.embeddings", # mamba-hf - "transformer.in_out_embed", # Grok - "embedding.word_embeddings", # chatglm - "transformer.token_embeddings", # openelm - "shared", # t5 - "rwkv.embeddings", # rwkv + "wte", # gpt2 + "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 + "model.embedding", # mamba-qbert + "backbone.embedding", # mamba + "backbone.embeddings", # mamba-hf + "transformer.in_out_embed", # Grok + "embedding.word_embeddings", # chatglm + "transformer.token_embeddings", # openelm + "shared", # t5 + "rwkv.embeddings", # rwkv6 + "model.embeddings", # rwkv7 + "model.word_embeddings", # bailingmoe + "language_model.model.embed_tokens", # llama4 ), + # 
Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( "embeddings.token_type_embeddings", # bert nomic-bert ), + # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", # bloom - "embeddings.LayerNorm", # bert - "emb_ln", # nomic-bert - "transformer.norm", # openelm - "rwkv.blocks.0.pre_ln", # rwkv - "backbone.norm", # wavtokenizer + "embeddings.LayerNorm", # bert + "emb_ln", # nomic-bert + "transformer.norm", # openelm + "rwkv.blocks.0.pre_ln", # rwkv + "rwkv.blocks.0.pre_ln", # rwkv6 + "model.pre_ln", # rwkv7 + "model.layers.0.pre_norm", # rwkv7 + "backbone.norm", # wavtokenizer ), + # Position embeddings MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", # gpt2 + "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert - "wpe", # gpt2 + "wpe", # gpt2 ), + # Output MODEL_TENSOR.OUTPUT: ( - "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe - "output", # llama-pth bloom internlm2 + "embed_out", # gptneox + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe + "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon - "lm_head.linear", # phi2 - "output_layer", # chatglm - "head", # rwkv - "head.out", # wavtokenizer + "lm_head.linear", # phi2 + "output_layer", # chatglm + "head", # rwkv + "head.out", # wavtokenizer + "language_model.lm_head", # llama4 ), + # Output norm MODEL_TENSOR.OUTPUT_NORM: ( - "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe - "norm", # llama-pth - "transformer.norm_f", # mpt dbrx - "ln_f", # refact bloom qwen gpt2 + "gpt_neox.final_layer_norm", # gptneox + "transformer.ln_f", # gpt2 gpt-j falcon jais exaone + "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe + "norm", # llama-pth + "transformer.norm_f", # mpt dbrx + "ln_f", # refact 
bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon - "model.final_layernorm", # persimmon - "lm_head.ln", # phi2 - "model.norm_f", # mamba-qbert - "backbone.norm_f", # mamba - "transformer.rms_norm", # Grok - "encoder.final_layernorm", # chatglm - "transformer.norm", # openelm - "model.norm", # nemotron - "rwkv.ln_out", # rwkv - "backbone.final_layer_norm", # wavtokenizer + "model.final_layernorm", # persimmon + "lm_head.ln", # phi2 + "model.norm_f", # mamba-qbert + "backbone.norm_f", # mamba + "transformer.rms_norm", # Grok + "encoder.final_layernorm", # chatglm + "transformer.norm", # openelm + "model.norm", # nemotron + "rwkv.ln_out", # rwkv6 + "model.ln_out", # rwkv7 + "backbone.final_layer_norm", # wavtokenizer + "language_model.model.norm", # llama4 ), + # Rope frequencies MODEL_TENSOR.ROPE_FREQS: ( "rope.freqs", # llama-pth "rotary_pos_emb.inv_freq", # chatglm ), + MODEL_TENSOR.ROPE_FACTORS_LONG: (), MODEL_TENSOR.ROPE_FACTORS_SHORT: (), + MODEL_TENSOR.CONV1D: ( - "backbone.embed", # roberta + "backbone.embed", # roberta ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "h.{bid}.input_layernorm", # bloom - "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe - "layers.{bid}.attention_norm", # llama-pth + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe + "layers.{bid}.attention_norm", # llama-pth 
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon - "model.layers.{bid}.ln1", # yi - "h.{bid}.ln_1", # gpt2 - "transformer.h.{bid}.ln", # phi2 - "model.layers.layers.{bid}.norm", # plamo - "model.layers.{bid}.attention_norm", # internlm2 - "model.layers.{bid}.norm", # mamba-qbert - "backbone.layers.{bid}.norm", # mamba - "transformer.decoder_layer.{bid}.rms_norm", # Grok - "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx - "encoder.layers.{bid}.input_layernorm", # chatglm - "transformer.layers.{bid}.attn_norm", # openelm - "rwkv.blocks.{bid}.ln1", # rwkv + "model.layers.{bid}.ln1", # yi + "h.{bid}.ln_1", # gpt2 + "transformer.h.{bid}.ln", # phi2 + "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 + "model.layers.{bid}.norm", # mamba-qbert + "backbone.layers.{bid}.norm", # mamba + "transformer.decoder_layer.{bid}.rms_norm", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx + "encoder.layers.{bid}.input_layernorm", # chatglm + "transformer.layers.{bid}.attn_norm", # openelm + "rwkv.blocks.{bid}.ln1", # rwkv6 + "model.layers.{bid}.ln1", # rwkv7 + "language_model.model.layers.{bid}.input_layernorm", # llama4 ), + # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b - "encoder.layer.{bid}.layer_norm_1", # jina-v2-code - "rwkv.blocks.{bid}.ln2", # rwkv + "transformer.h.{bid}.ln_attn", # falcon40b + "encoder.layer.{bid}.layer_norm_1", # jina-v2-code + "rwkv.blocks.{bid}.ln2", # rwkv6 + "model.layers.{bid}.ln2", # rwkv7 ), + # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( - "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox - "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais - "transformer.blocks.{bid}.attn.Wqkv", # mpt - "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx - "transformer.h.{bid}.self_attention.query_key_value", # falcon - "h.{bid}.self_attention.query_key_value", # bloom + 
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox + "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais + "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx + "transformer.h.{bid}.self_attention.query_key_value", # falcon + "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon - "model.layers.{bid}.self_attn.query_key_value", # persimmon - "h.{bid}.attn.c_attn", # gpt2 - "transformer.h.{bid}.mixer.Wqkv", # phi2 - "encoder.layers.{bid}.attn.Wqkv", # nomic-bert - "model.layers.{bid}.self_attn.qkv_proj", # phi3 - "encoder.layers.{bid}.self_attention.query_key_value", # chatglm - "transformer.layers.{bid}.attn.qkv_proj", # openelm + "model.layers.{bid}.self_attn.query_key_value", # persimmon + "h.{bid}.attn.c_attn", # gpt2 + "transformer.h.{bid}.mixer.Wqkv", # phi2 + "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "model.layers.{bid}.self_attn.qkv_proj", # phi3 + "encoder.layers.{bid}.self_attention.query_key_value", # chatglm + "transformer.layers.{bid}.attn.qkv_proj", # openelm ), + # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe - "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom - "layers.{bid}.attention.wq", # llama-pth - "encoder.layer.{bid}.attention.self.query", # bert - "transformer.h.{bid}.attn.q_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.q_proj", # plamo - "model.layers.{bid}.attention.wq", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok - "transformer.h.{bid}.attn.attention.q_proj", # exaone + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe + "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + 
"model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok + "transformer.h.{bid}.attn.attention.q_proj", # exaone + "language_model.model.layers.{bid}.self_attn.q_proj", # llama4 ), + # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe - "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom - "layers.{bid}.attention.wk", # llama-pth - "encoder.layer.{bid}.attention.self.key", # bert - "transformer.h.{bid}.attn.k_proj", # gpt-j - "transformer.h.{bid}.attn.k", # refact - "model.layers.layers.{bid}.self_attn.k_proj", # plamo - "model.layers.{bid}.attention.wk", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok - "transformer.h.{bid}.attn.attention.k_proj", # exaone + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe + "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "transformer.h.{bid}.attn.k", # refact + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok + "transformer.h.{bid}.attn.attention.k_proj", # exaone + "language_model.model.layers.{bid}.self_attn.k_proj", # llama4 ), + # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe - "layers.{bid}.attention.wv", # llama-pth - "encoder.layer.{bid}.attention.self.value", # bert - "transformer.h.{bid}.attn.v_proj", # gpt-j - "transformer.h.{bid}.attn.v", # refact - "model.layers.layers.{bid}.self_attn.v_proj", # plamo - "model.layers.{bid}.attention.wv", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok - 
"transformer.h.{bid}.attn.attention.v_proj", # exaone + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "transformer.h.{bid}.attn.v", # refact + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok + "transformer.h.{bid}.attn.attention.v_proj", # exaone + "language_model.model.layers.{bid}.self_attn.v_proj", # llama4 ), + # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe - "model.layers.{bid}.self_attn.linear_attn", # deci - "layers.{bid}.attention.wo", # llama-pth - "encoder.layer.{bid}.attention.output.dense", # bert - "transformer.h.{bid}.attn.out_proj", # gpt-j - "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon - "model.layers.{bid}.self_attn.dense", # persimmon - "h.{bid}.attn.c_proj", # gpt2 - "transformer.h.{bid}.mixer.out_proj", # phi2 - "model.layers.layers.{bid}.self_attn.o_proj", # plamo - "model.layers.{bid}.attention.wo", # internlm2 - "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe + "model.layers.{bid}.self_attn.linear_attn", # deci + "layers.{bid}.attention.wo", # llama-pth 
+ "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx - "encoder.layers.{bid}.self_attention.dense", # chatglm - "transformer.layers.{bid}.attn.out_proj", # openelm - "transformer.h.{bid}.attn.attention.out_proj", # exaone + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx + "encoder.layers.{bid}.self_attention.dense", # chatglm + "transformer.layers.{bid}.attn.out_proj", # openelm + "transformer.h.{bid}.attn.attention.out_proj", # exaone + "language_model.model.layers.{bid}.self_attn.o_proj", # llama4 ), + # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert - "encoder.layers.{bid}.norm1", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "encoder.layers.{bid}.norm1", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), + MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge + "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( - "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf - "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth - "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo - "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell + 
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo + "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell ), + # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone - "h.{bid}.post_attention_layernorm", # bloom - "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe phimoe - "layers.{bid}.ffn_norm", # llama-pth + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone + "h.{bid}.post_attention_layernorm", # bloom + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe phimoe + "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon - "model.layers.{bid}.ln2", # yi - "h.{bid}.ln_2", # gpt2 - "model.layers.{bid}.ffn_norm", # internlm2 - "transformer.decoder_layer.{bid}.rms_norm_2", # Grok - "encoder.layers.{bid}.post_attention_layernorm", # chatglm - "transformer.layers.{bid}.ffn_norm", # openelm + "model.layers.{bid}.ln2", # yi + "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 + "transformer.decoder_layer.{bid}.rms_norm_2", # Grok + "encoder.layers.{bid}.post_attention_layernorm", # chatglm + "transformer.layers.{bid}.ffn_norm", # openelm + "language_model.model.layers.{bid}.post_attention_layernorm", # llama4 ), + # Post feed-forward norm MODEL_TENSOR.FFN_PRE_NORM: ( - "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 + "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 ), + # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( - "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 + 
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 + "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414 ), + MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe - "model.layers.{bid}.mlp.gate", # qwen2moe olmoe - "transformer.decoder_layer.{bid}.router", # Grok - "transformer.blocks.{bid}.ffn.router.layer", # dbrx - "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe + "transformer.decoder_layer.{bid}.router", # Grok + "transformer.blocks.{bid}.ffn.router.layer", # dbrx + "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe + "language_model.model.layers.{bid}.feed_forward.router", # llama4 + "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe ), + MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe + "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe ), + MODEL_TENSOR.FFN_EXP_PROBS_B: ( - "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 + "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 ), + # Feed-forward up MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 jais - "transformer.blocks.{bid}.ffn.up_proj", # mpt - "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon - "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2 - "layers.{bid}.feed_forward.w3", # llama-pth - "encoder.layer.{bid}.intermediate.dense", # bert - "transformer.h.{bid}.mlp.fc_in", # gpt-j - "transformer.h.{bid}.mlp.linear_3", # refact + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 jais + "transformer.blocks.{bid}.ffn.up_proj", # mpt + 
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "h.{bid}.mlp.dense_h_to_4h", # bloom + "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2 + "layers.{bid}.feed_forward.w3", # llama-pth + "encoder.layer.{bid}.intermediate.dense", # bert + "transformer.h.{bid}.mlp.fc_in", # gpt-j + "transformer.h.{bid}.mlp.linear_3", # refact "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon - "transformer.h.{bid}.mlp.w1", # qwen - "h.{bid}.mlp.c_fc", # gpt2 - "transformer.h.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.fc1", # phi2 - "model.layers.{bid}.mlp.gate_up_proj", # phi3 - "model.layers.layers.{bid}.mlp.up_proj", # plamo - "model.layers.{bid}.feed_forward.w3", # internlm2 - "encoder.layers.{bid}.mlp.fc11", # nomic-bert - "model.layers.{bid}.mlp.c_fc", # starcoder2 - "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 - "model.layers.{bid}.residual_mlp.w3", # arctic - "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm - "transformer.h.{bid}.mlp.c_fc_1", # exaone + "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "transformer.h.{bid}.mlp.w1", # qwen + "h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414 + "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 + "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe + "model.layers.{bid}.mlp.c_fc", # starcoder2 + "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "model.layers.{bid}.residual_mlp.w3", # arctic + "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm + "transformer.h.{bid}.mlp.c_fc_1", # exaone + "language_model.model.layers.{bid}.feed_forward.up_proj", # llama4 ), + MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear_v", # 
Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) - "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged) + "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged) + "language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe ), + MODEL_TENSOR.FFN_UP_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe - "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 + "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 + "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 ), + # AWQ-activation gate MODEL_TENSOR.FFN_ACT: ( "transformer.blocks.{bid}.ffn.act", # mpt ), + # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "transformer.h.{bid}.mlp.c_fc2", # jais - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 - "transformer.h.{bid}.mlp.linear_1", # refact - "model.layers.{bid}.residual_mlp.w1", # arctic - "transformer.h.{bid}.mlp.c_fc_0", # exaone + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + 
"encoder.layers.{bid}.mlp.fc12", # nomic-bert + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 + "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone + "language_model.model.layers.{bid}.feed_forward.gate_proj", # llama4 ), + MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) - "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged) + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged) + "language_model.model.layers.{bid}.feed_forward.experts.gate_proj", # llama4 ), + MODEL_TENSOR.FFN_GATE_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe - "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 + "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 + "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4 ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais - "transformer.blocks.{bid}.ffn.down_proj", # mpt - "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon - "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2 - "layers.{bid}.feed_forward.w2", # llama-pth - "encoder.layer.{bid}.output.dense", # bert - "transformer.h.{bid}.mlp.fc_out", # gpt-j + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + 
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "h.{bid}.mlp.dense_4h_to_h", # bloom + "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2 + "layers.{bid}.feed_forward.w2", # llama-pth + "encoder.layer.{bid}.output.dense", # bert + "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon - "h.{bid}.mlp.c_proj", # gpt2 - "transformer.h.{bid}.mlp.fc2", # phi2 - "model.layers.{bid}.mlp.fc2", # phi2 - "model.layers.layers.{bid}.mlp.down_proj", # plamo - "model.layers.{bid}.feed_forward.w2", # internlm2 - "encoder.layers.{bid}.mlp.fc2", # nomic-bert - "model.layers.{bid}.mlp.c_proj", # starcoder2 - "encoder.layer.{bid}.mlp.wo", # jina-bert-v2 - "transformer.layers.{bid}.ffn.proj_2", # openelm - "model.layers.{bid}.residual_mlp.w2", # arctic - "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 - "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm - "model.layers.h.{bid}.mlp.c_proj", # exaone + "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "h.{bid}.mlp.c_proj", # gpt2 + "transformer.h.{bid}.mlp.fc2", # phi2 + "model.layers.{bid}.mlp.fc2", # phi2 + "model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 + "encoder.layers.{bid}.mlp.fc2", # nomic-bert + "model.layers.{bid}.mlp.c_proj", # starcoder2 + "encoder.layer.{bid}.mlp.wo", # jina-bert-v2 + "transformer.layers.{bid}.ffn.proj_2", # openelm + "model.layers.{bid}.residual_mlp.w2", # arctic + "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 + "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm + "model.layers.h.{bid}.mlp.c_proj", # exaone + "language_model.model.layers.{bid}.feed_forward.down_proj", # llama4 ), + MODEL_TENSOR.FFN_DOWN_EXP: ( - "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) - 
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx - "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) - "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe - "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged) + "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe + "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged) + "language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe ), + MODEL_TENSOR.FFN_DOWN_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe - "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 + "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 + "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4 ), + MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", - "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2 - "transformer.blocks.{bid}.attn.q_ln", # sea-lion - "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 - "transformer.layers.{bid}.attn.q_norm", # openelm + "model.layers.{bid}.self_attn.q_layernorm", # persimmon + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2 + "transformer.blocks.{bid}.attn.q_ln", # sea-lion + "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 + "transformer.layers.{bid}.attn.q_norm", # openelm ), + MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - 
"model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2 - "transformer.blocks.{bid}.attn.k_ln", # sea-lion - "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 - "transformer.layers.{bid}.attn.k_norm", # openelm + "model.layers.{bid}.self_attn.k_layernorm", # persimmon + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2 + "transformer.blocks.{bid}.attn.k_ln", # sea-lion + "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 + "transformer.layers.{bid}.attn.k_norm", # openelm ), + MODEL_TENSOR.ROPE_FREQS: ( "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ), + MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok - "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2 - "encoder.layer.{bid}.layer_norm_2", # jina-v2-code + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2 + "encoder.layer.{bid}.layer_norm_2" # jina-v2-code ), + MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", ), + MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", ), + MODEL_TENSOR.SSM_X: ( "model.layers.{bid}.x_proj", "backbone.layers.{bid}.mixer.x_proj", ), + MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", ), + MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", ), + MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", ), + MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", ), + + MODEL_TENSOR.TIME_MIX_W0: ( + "model.layers.{bid}.attention.w0", 
# rwkv7 + ), + MODEL_TENSOR.TIME_MIX_W1: ( - "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 - "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2 + "model.layers.{bid}.attention.w1", # rwkv7 ), + MODEL_TENSOR.TIME_MIX_W2: ( - "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 - "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2 + "model.layers.{bid}.attention.w2", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_A0: ( + "model.layers.{bid}.attention.a0", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_A1: ( + "model.layers.{bid}.attention.a1", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_A2: ( + "model.layers.{bid}.attention.a2", # rwkv7 ), + + MODEL_TENSOR.TIME_MIX_V0: ( + "model.layers.{bid}.attention.v0", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_V1: ( + "model.layers.{bid}.attention.v1", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_V2: ( + "model.layers.{bid}.attention.v2", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_G1: ( + "model.layers.{bid}.attention.g1", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_G2: ( + "model.layers.{bid}.attention.g2", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_K_K: ( + "model.layers.{bid}.attention.k_k", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_K_A: ( + "model.layers.{bid}.attention.k_a", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_R_K: ( + "model.layers.{bid}.attention.r_k", # rwkv7 + ), + MODEL_TENSOR.TIME_MIX_LERP_X: ( - "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6 "model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_LERP_K: ( - "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6 "model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_LERP_V: ( - 
"rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6 "model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_LERP_R: ( - "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6 "model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_LERP_G: ( - "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6 "model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_LERP_W: ( - "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6 "model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_FIRST: ( - "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6 ), + MODEL_TENSOR.TIME_MIX_DECAY: ( - "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_decay", # rwkv6 "model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_DECAY_W1: ( - "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 - "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6 + "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_DECAY_W2: ( - "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 - "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6 + "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_KEY: ( - "rwkv.blocks.{bid}.attention.key", # rwkv - "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.key", # rwkv6 + "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.key", # rwkv7 + "model.layers.{bid}.attention.k_proj", # rwkv7 ), + 
MODEL_TENSOR.TIME_MIX_VALUE: ( - "rwkv.blocks.{bid}.attention.value", # rwkv - "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.value", # rwkv6 + "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.value", # rwkv7 + "model.layers.{bid}.attention.v_proj", # rwkv7 ), + MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( - "rwkv.blocks.{bid}.attention.receptance", # rwkv - "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.receptance", # rwkv6 + "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.receptance", # rwkv7 + "model.layers.{bid}.attention.r_proj", # rwkv7 ), + MODEL_TENSOR.TIME_MIX_GATE: ( - "rwkv.blocks.{bid}.attention.gate", # rwkv - "model.layers.{bid}.self_attn.gate", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.gate", # rwkv6 + "model.layers.{bid}.self_attn.gate", # rwkv6qwen2 ), + MODEL_TENSOR.TIME_MIX_LN: ( - "rwkv.blocks.{bid}.attention.ln_x", # rwkv + "rwkv.blocks.{bid}.attention.ln_x", # rwkv6 + "model.layers.{bid}.attention.ln_x" # rwkv7 ), + MODEL_TENSOR.TIME_MIX_OUTPUT: ( - "rwkv.blocks.{bid}.attention.output", # rwkv - "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.output", # rwkv6 + "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.output", # rwkv7 + "model.layers.{bid}.attention.o_proj", # rwkv7 ), + MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( - "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6 + "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6 + "model.layers.{bid}.feed_forward.x_k", # rwkv7 ), + MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( - "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6 + "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6 ), + MODEL_TENSOR.CHANNEL_MIX_KEY: ( - "rwkv.blocks.{bid}.feed_forward.key", # rwkv + "rwkv.blocks.{bid}.feed_forward.key", # rwkv6 + "model.layers.{bid}.feed_forward.key", # rwkv7 ), + MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( 
- "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv + "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6 ), + MODEL_TENSOR.CHANNEL_MIX_VALUE: ( - "rwkv.blocks.{bid}.feed_forward.value", # rwkv + "rwkv.blocks.{bid}.feed_forward.value", # rwkv6 + "model.layers.{bid}.feed_forward.value", # rwkv7 ), + MODEL_TENSOR.ATTN_Q_A: ( - "model.layers.{bid}.self_attn.q_a_proj", # deepseek2 + "model.layers.{bid}.self_attn.q_a_proj", # deepseek2 ), + MODEL_TENSOR.ATTN_Q_B: ( - "model.layers.{bid}.self_attn.q_b_proj", # deepseek2 + "model.layers.{bid}.self_attn.q_b_proj", # deepseek2 ), + MODEL_TENSOR.ATTN_KV_A_MQA: ( - "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2 + "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2 ), + MODEL_TENSOR.ATTN_KV_B: ( - "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 + "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_K_B: ( + "model.layers.{bid}.self_attn.k_b_proj", # deepseek2 ), + + MODEL_TENSOR.ATTN_V_B: ( + "model.layers.{bid}.self_attn.v_b_proj", # deepseek2 + ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( - "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 + "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 ), + MODEL_TENSOR.ATTN_KV_A_NORM: ( - "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2 + "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2 ), + MODEL_TENSOR.ATTN_SUB_NORM: ( "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet ), + MODEL_TENSOR.FFN_SUB_NORM: ( "model.layers.{bid}.mlp.ffn_layernorm", # bitnet ), + MODEL_TENSOR.DEC_ATTN_NORM: ( - "decoder.block.{bid}.layer.0.layer_norm", # t5 + "decoder.block.{bid}.layer.0.layer_norm", # t5 ), + MODEL_TENSOR.DEC_ATTN_Q: ( - "decoder.block.{bid}.layer.0.SelfAttention.q", # t5 + "decoder.block.{bid}.layer.0.SelfAttention.q", # t5 ), + MODEL_TENSOR.DEC_ATTN_K: ( - "decoder.block.{bid}.layer.0.SelfAttention.k", # t5 + "decoder.block.{bid}.layer.0.SelfAttention.k", # t5 ), + MODEL_TENSOR.DEC_ATTN_V: ( - 
"decoder.block.{bid}.layer.0.SelfAttention.v", # t5 + "decoder.block.{bid}.layer.0.SelfAttention.v", # t5 ), + MODEL_TENSOR.DEC_ATTN_OUT: ( - "decoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "decoder.block.{bid}.layer.0.SelfAttention.o", # t5 ), + MODEL_TENSOR.DEC_ATTN_REL_B: ( - "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 ), + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( - "decoder.block.{bid}.layer.1.layer_norm", # t5 + "decoder.block.{bid}.layer.1.layer_norm", # t5 ), + MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( - "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5 ), + MODEL_TENSOR.DEC_CROSS_ATTN_K: ( - "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5 ), + MODEL_TENSOR.DEC_CROSS_ATTN_V: ( - "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5 ), + MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( - "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5 ), + MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( - "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5 ), + MODEL_TENSOR.DEC_FFN_NORM: ( - "decoder.block.{bid}.layer.2.layer_norm", # t5 + "decoder.block.{bid}.layer.2.layer_norm", # t5 ), + MODEL_TENSOR.DEC_FFN_GATE: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5 ), + MODEL_TENSOR.DEC_FFN_UP: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5 - "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5 ), + MODEL_TENSOR.DEC_FFN_DOWN: ( - 
"decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5 ), + MODEL_TENSOR.DEC_OUTPUT_NORM: ( - "decoder.final_layer_norm", # t5 + "decoder.final_layer_norm", # t5 ), + MODEL_TENSOR.ENC_ATTN_NORM: ( - "encoder.block.{bid}.layer.0.layer_norm", # t5 + "encoder.block.{bid}.layer.0.layer_norm", # t5 ), + MODEL_TENSOR.ENC_ATTN_Q: ( - "encoder.block.{bid}.layer.0.SelfAttention.q", # t5 + "encoder.block.{bid}.layer.0.SelfAttention.q", # t5 ), + MODEL_TENSOR.ENC_ATTN_K: ( - "encoder.block.{bid}.layer.0.SelfAttention.k", # t5 + "encoder.block.{bid}.layer.0.SelfAttention.k", # t5 ), + MODEL_TENSOR.ENC_ATTN_V: ( - "encoder.block.{bid}.layer.0.SelfAttention.v", # t5 + "encoder.block.{bid}.layer.0.SelfAttention.v", # t5 ), + MODEL_TENSOR.ENC_ATTN_OUT: ( - "encoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "encoder.block.{bid}.layer.0.SelfAttention.o", # t5 ), + MODEL_TENSOR.ENC_ATTN_REL_B: ( - "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 ), + MODEL_TENSOR.ENC_FFN_NORM: ( - "encoder.block.{bid}.layer.1.layer_norm", # t5 + "encoder.block.{bid}.layer.1.layer_norm", # t5 ), + MODEL_TENSOR.ENC_FFN_GATE: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5 ), + MODEL_TENSOR.ENC_FFN_UP: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5 - "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5 ), + MODEL_TENSOR.ENC_FFN_DOWN: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 ), + ############################################################################ # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( 
- "encoder.final_layer_norm", # t5 + "encoder.final_layer_norm", # t5 ), + MODEL_TENSOR.CLS: ( - "classifier", # jina - "classifier.dense", # roberta + "classifier", # jina + "classifier.dense", # roberta ), + MODEL_TENSOR.CLS_OUT: ( - "classifier.out_proj", # roberta + "classifier.out_proj", # roberta ), ############################################################################# + MODEL_TENSOR.CONVNEXT_DW: ( - "backbone.convnext.{bid}.dwconv", # wavtokenizer + "backbone.convnext.{bid}.dwconv", # wavtokenizer ), + MODEL_TENSOR.CONVNEXT_NORM: ( - "backbone.convnext.{bid}.norm", # wavtokenizer + "backbone.convnext.{bid}.norm", # wavtokenizer ), + MODEL_TENSOR.CONVNEXT_PW1: ( - "backbone.convnext.{bid}.pwconv1", # wavtokenizer + "backbone.convnext.{bid}.pwconv1", # wavtokenizer ), + MODEL_TENSOR.CONVNEXT_PW2: ( - "backbone.convnext.{bid}.pwconv2", # wavtokenizer + "backbone.convnext.{bid}.pwconv2", # wavtokenizer ), + MODEL_TENSOR.CONVNEXT_GAMMA: ( - "backbone.convnext.{bid}.gamma", # wavtokenizer + "backbone.convnext.{bid}.gamma", # wavtokenizer ), + MODEL_TENSOR.POSNET_CONV1: ( - "backbone.posnet.{bid}.conv1", # wavtokenizer + "backbone.posnet.{bid}.conv1", # wavtokenizer ), + MODEL_TENSOR.POSNET_CONV2: ( - "backbone.posnet.{bid}.conv2", # wavtokenizer + "backbone.posnet.{bid}.conv2", # wavtokenizer ), + MODEL_TENSOR.POSNET_NORM: ( - "backbone.posnet.{bid}.norm", # wavtokenizer + "backbone.posnet.{bid}.norm", # wavtokenizer ), + MODEL_TENSOR.POSNET_NORM1: ( - "backbone.posnet.{bid}.norm1", # wavtokenizer + "backbone.posnet.{bid}.norm1", # wavtokenizer ), + MODEL_TENSOR.POSNET_NORM2: ( - "backbone.posnet.{bid}.norm2", # wavtokenizer + "backbone.posnet.{bid}.norm2", # wavtokenizer ), + MODEL_TENSOR.POSNET_ATTN_NORM: ( - "backbone.posnet.{bid}.norm", # wavtokenizer + "backbone.posnet.{bid}.norm", # wavtokenizer ), + MODEL_TENSOR.POSNET_ATTN_Q: ( - "backbone.posnet.{bid}.q", # wavtokenizer + "backbone.posnet.{bid}.q", # wavtokenizer ), + MODEL_TENSOR.POSNET_ATTN_K: ( 
- "backbone.posnet.{bid}.k", # wavtokenizer + "backbone.posnet.{bid}.k", # wavtokenizer ), + MODEL_TENSOR.POSNET_ATTN_V: ( - "backbone.posnet.{bid}.v", # wavtokenizer + "backbone.posnet.{bid}.v", # wavtokenizer ), + MODEL_TENSOR.POSNET_ATTN_OUT: ( - "backbone.posnet.{bid}.proj_out", # wavtokenizer + "backbone.posnet.{bid}.proj_out", # wavtokenizer + ), + + ############################################################################# + ## Vision encoder + + MODEL_TENSOR.V_MMPROJ: ( + "multi_modal_projector.linear_{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_FC: ( + "model.connector.modality_projection.proj", # SmolVLM + ), + + MODEL_TENSOR.V_MMPROJ_MLP: ( + "model.mm_projector.mlp.mlp.{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_PEG: ( + "model.mm_projector.peg.peg.{bid}", + ), + + MODEL_TENSOR.V_ENC_EMBD_CLS: ( + "vision_tower.vision_model.embeddings.class_embedding", + ), + + MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "vision_tower.vision_model.embeddings.patch_embedding", + "vpm.embeddings.patch_embedding", + "model.vision_model.embeddings.patch_embedding", # SmolVLM + "vision_tower.patch_conv", # pixtral + ), + + MODEL_TENSOR.V_ENC_EMBD_POS: ( + "vision_tower.vision_model.embeddings.position_embedding", + "vpm.embeddings.position_embedding", + "model.vision_model.embeddings.position_embedding", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_Q: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "vpm.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral + ), + + MODEL_TENSOR.V_ENC_ATTN_K: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vpm.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral + ), + + MODEL_TENSOR.V_ENC_ATTN_V: ( + 
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "vpm.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral + ), + + MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + "vpm.encoder.layers.{bid}.layer_norm1", + "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral + ), + + MODEL_TENSOR.V_ENC_OUTPUT: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + "vpm.encoder.layers.{bid}.self_attn.out_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral + ), + + MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + "vpm.encoder.layers.{bid}.layer_norm2", + "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral + ), + + MODEL_TENSOR.V_ENC_FFN_UP: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "vpm.encoder.layers.{bid}.mlp.fc1", + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped) + "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral + ), + + MODEL_TENSOR.V_ENC_FFN_GATE: ( + "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral + ), + + MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "vpm.encoder.layers.{bid}.mlp.fc2", + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped) + "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral + ), + + MODEL_TENSOR.V_PRE_NORM: ( + "vision_tower.vision_model.pre_layrnorm", + "vision_tower.ln_pre", # pixtral + ), + + MODEL_TENSOR.V_POST_NORM: 
( + "vision_tower.vision_model.post_layernorm", + "model.vision_model.post_layernorm", # SmolVLM + ), + + MODEL_TENSOR.V_MM_INP_PROJ: ( + "multi_modal_projector.mm_input_projection", + ), + + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( + "multi_modal_projector.mm_soft_emb_norm", + ), + + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( + "resampler.pos_embed_k", + ), + + MODEL_TENSOR.V_RESMPL_ATTN_Q: ( + "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_K: ( + "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_V: ( + "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( + "resampler.attn.out_proj", + ), + + MODEL_TENSOR.V_RESMPL_KV: ( + "resampler.kv_proj", + ), + + MODEL_TENSOR.V_RESMPL_POST_NORM: ( + "resampler.ln_post", + ), + + MODEL_TENSOR.V_RESMPL_KV_NORM: ( + "resampler.ln_kv", + ), + + MODEL_TENSOR.V_RESMPL_Q_NORM: ( + "resampler.ln_q", + ), + + MODEL_TENSOR.V_RESMPL_PROJ: ( + "resampler.proj", + ), + + MODEL_TENSOR.V_RESMPL_QUERY: ( + "resampler.query", + ), + + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: ( + "v.token_embd.img_break", # for pixtral, this is a generated vector ), } # architecture-specific block mappings arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { MODEL_ARCH.ARCTIC: { - MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",), - MODEL_TENSOR.FFN_NORM_EXP: ("model.layers.{bid}.post_attention_layernorm",), + MODEL_TENSOR.FFN_NORM: ( + "model.layers.{bid}.residual_layernorm", + ), + MODEL_TENSOR.FFN_NORM_EXP: ( + "model.layers.{bid}.post_attention_layernorm", + ), }, } @@ -695,35 +1084,31 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): if tensor not in MODEL_TENSORS[arch]: continue - tensor_name = TENSOR_NAMES[tensor].format(bid=bid) + tensor_name = TENSOR_NAMES[tensor].format(bid = bid) self.mapping[tensor_name] = (tensor, tensor_name) 
for key in keys: - key = key.format(bid=bid) + key = key.format(bid = bid) self.mapping[key] = (tensor, tensor_name) - def get_type_and_name( - self, key: str, try_suffixes: Sequence[str] = () - ) -> tuple[MODEL_TENSOR, str] | None: + def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) if result is not None: return result for suffix in try_suffixes: if key.endswith(suffix): - result = self.mapping.get(key[: -len(suffix)]) + result = self.mapping.get(key[:-len(suffix)]) if result is not None: return result[0], result[1] + suffix return None def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: - result = self.get_type_and_name(key, try_suffixes=try_suffixes) + result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[1] - def get_type( - self, key: str, try_suffixes: Sequence[str] = () - ) -> MODEL_TENSOR | None: - result = self.get_type_and_name(key, try_suffixes=try_suffixes) + def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[0] diff --git a/lpm_kernel/L2/gguf-py/gguf/utility.py b/lpm_kernel/L2/gguf-py/gguf/utility.py index 2cb0def9..e5251aef 100644 --- a/lpm_kernel/L2/gguf-py/gguf/utility.py +++ b/lpm_kernel/L2/gguf-py/gguf/utility.py @@ -1,33 +1,31 @@ from __future__ import annotations +from dataclasses import dataclass from typing import Literal +import os +import json + def fill_templated_filename(filename: str, output_type: str | None) -> str: # Given a file name fill in any type templates e.g. 
'some-model-name.{ftype}.gguf' ftype_lowercase: str = output_type.lower() if output_type is not None else "" ftype_uppercase: str = output_type.upper() if output_type is not None else "" - return filename.format( - ftype_lowercase, - outtype=ftype_lowercase, - ftype=ftype_lowercase, - OUTTYPE=ftype_uppercase, - FTYPE=ftype_uppercase, - ) - - -def model_weight_count_rounded_notation( - model_params_count: int, min_digits: int = 2 -) -> str: - if model_params_count > 1e12: + return filename.format(ftype_lowercase, + outtype=ftype_lowercase, ftype=ftype_lowercase, + OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase) + + +def model_weight_count_rounded_notation(model_params_count: int, min_digits: int = 2) -> str: + if model_params_count > 1e12 : # Trillions Of Parameters scaled_model_params = model_params_count * 1e-12 scale_suffix = "T" - elif model_params_count > 1e9: + elif model_params_count > 1e9 : # Billions Of Parameters scaled_model_params = model_params_count * 1e-9 scale_suffix = "B" - elif model_params_count > 1e6: + elif model_params_count > 1e6 : # Millions Of Parameters scaled_model_params = model_params_count * 1e-6 scale_suffix = "M" @@ -36,65 +34,231 @@ def model_weight_count_rounded_notation( scaled_model_params = model_params_count * 1e-3 scale_suffix = "K" - fix = max(min_digits - len(str(round(scaled_model_params)).lstrip("0")), 0) + fix = max(min_digits - len(str(round(scaled_model_params)).lstrip('0')), 0) return f"{scaled_model_params:.{fix}f}{scale_suffix}" -def size_label( - total_params: int, shared_params: int, expert_params: int, expert_count: int -) -> str: +def size_label(total_params: int, shared_params: int, expert_params: int, expert_count: int) -> str: + if expert_count > 0: - pretty_size = model_weight_count_rounded_notation( - abs(shared_params) + abs(expert_params), min_digits=2 - ) + pretty_size = model_weight_count_rounded_notation(abs(shared_params) + abs(expert_params), min_digits=2) size_class = f"{expert_count}x{pretty_size}" 
else: - size_class = model_weight_count_rounded_notation( - abs(total_params), min_digits=2 - ) + size_class = model_weight_count_rounded_notation(abs(total_params), min_digits=2) return size_class -def naming_convention( - model_name: str | None, - base_name: str | None, - finetune_string: str | None, - version_string: str | None, - size_label: str | None, - output_type: str | None, - model_type: Literal["vocab", "LoRA"] | None = None, -) -> str: - # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention +def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str: + # Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention if base_name is not None: - name = base_name.strip().replace(" ", "-").replace("/", "-") + name = base_name.strip().replace(' ', '-').replace('/', '-') elif model_name is not None: - name = model_name.strip().replace(" ", "-").replace("/", "-") + name = model_name.strip().replace(' ', '-').replace('/', '-') else: name = "ggml-model" parameters = f"-{size_label}" if size_label is not None else "" - finetune = ( - f"-{finetune_string.strip().replace(' ', '-')}" - if finetune_string is not None - else "" - ) + finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else "" - version = ( - f"-{version_string.strip().replace(' ', '-')}" - if version_string is not None - else "" - ) + version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else "" - encoding = ( - f"-{output_type.strip().replace(' ', '-').upper()}" - if output_type is not None - else "" - ) + encoding = f"-{output_type.strip().replace(' ', '-').upper()}" if output_type is not None else "" kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else "" 
return f"{name}{parameters}{finetune}{version}{encoding}{kind}" + + +@dataclass +class RemoteTensor: + dtype: str + shape: tuple[int, ...] + offset_start: int + size: int + url: str + + def data(self) -> bytearray: + # TODO: handle request errors (maybe with limited retries?) + # NOTE: using a bytearray, otherwise PyTorch complains the buffer is not writeable + data = bytearray(SafetensorRemote.get_data_by_range(url=self.url, start=self.offset_start, size=self.size)) + return data + + +class SafetensorRemote: + """ + Utility class to handle remote safetensor files. + This class is designed to work with Hugging Face model repositories. + + Example (one model has single safetensor file, the other has multiple): + for model_id in ["ngxson/TEST-Tiny-Llama4", "Qwen/Qwen2.5-7B-Instruct"]: + tensors = SafetensorRemote.get_list_tensors_hf_model(model_id) + print(tensors) + + Example reading tensor data: + tensors = SafetensorRemote.get_list_tensors_hf_model(model_id) + for name, meta in tensors.items(): + # meta is a RemoteTensor with dtype, shape, offset_start, size and url fields + # read the tensor data + data = SafetensorRemote.get_data_by_range(meta.url, meta.offset_start, meta.size) + print(data) + """ + + BASE_DOMAIN = "https://huggingface.co" + ALIGNMENT = 8 # bytes + + @classmethod + def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: + """ + Get list of tensors from a Hugging Face model repository. + + Returns a dictionary of tensor names and their metadata. 
+ Each tensor is represented as a RemoteTensor (dtype, shape, offset_start, size, url) + """ + # case 1: model has only one single model.safetensor file + is_single_file = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors") + if is_single_file: + url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors" + return cls.get_list_tensors(url) + + # case 2: model has multiple files + index_url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json" + is_multiple_files = cls.check_file_exist(index_url) + if is_multiple_files: + # read the index file + index_data = cls.get_data_by_range(index_url, 0) + index_str = index_data.decode('utf-8') + index_json = json.loads(index_str) + assert index_json.get("weight_map") is not None, "weight_map not found in index file" + weight_map = index_json["weight_map"] + # get the list of files + all_files = list(set(weight_map.values())) + all_files.sort() # make sure we load shard files in order + # get the list of tensors + tensors: dict[str, RemoteTensor] = {} + for file in all_files: + url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/{file}" + for key, val in cls.get_list_tensors(url).items(): + tensors[key] = val + return tensors + + raise ValueError(f"Model {model_id} does not have any safetensor files") + + @classmethod + def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]: + """ + Get list of tensors from a remote safetensor file. + + Returns a dictionary of tensor names and their metadata. 
+ Each tensor is represented as a RemoteTensor (dtype, shape, offset_start, size, url) + """ + metadata, data_start_offset = cls.get_metadata(url) + res: dict[str, RemoteTensor] = {} + + for name, meta in metadata.items(): + if name == "__metadata__": + continue + if not isinstance(meta, dict): + raise ValueError(f"Invalid metadata for tensor '{name}': {meta}") + try: + dtype = meta["dtype"] + shape = meta["shape"] + offset_start_relative, offset_end_relative = meta["data_offsets"] + size = offset_end_relative - offset_start_relative + offset_start = data_start_offset + offset_start_relative + res[name] = RemoteTensor(dtype=dtype, shape=tuple(shape), offset_start=offset_start, size=size, url=url) + except KeyError as e: + raise ValueError(f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}") + + return res + + @classmethod + def get_metadata(cls, url: str) -> tuple[dict, int]: + """ + Get JSON metadata from a remote safetensor file. + + Returns tuple of (metadata, data_start_offset) + """ + # Request first 5MB of the file (hopefully enough for metadata) + read_size = 5 * 1024 * 1024 + raw_data = cls.get_data_by_range(url, 0, read_size) + + # Parse header + # First 8 bytes contain the metadata length as u64 little-endian + if len(raw_data) < 8: + raise ValueError("Not enough data to read metadata size") + metadata_length = int.from_bytes(raw_data[:8], byteorder='little') + + # Calculate the data start offset + data_start_offset = 8 + metadata_length + alignment = SafetensorRemote.ALIGNMENT + if data_start_offset % alignment != 0: + data_start_offset += alignment - (data_start_offset % alignment) + + # Check if we have enough data to read the metadata + if len(raw_data) < 8 + metadata_length: + raise ValueError(f"Could not read complete metadata. 
Need {8 + metadata_length} bytes, got {len(raw_data)}") + + # Extract metadata bytes and parse as JSON + metadata_bytes = raw_data[8:8 + metadata_length] + metadata_str = metadata_bytes.decode('utf-8') + try: + metadata = json.loads(metadata_str) + return metadata, data_start_offset + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse safetensor metadata as JSON: {e}") + + @classmethod + def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes: + """ + Get raw byte data from a remote file by range. + If size is not specified (-1), no Range header is sent and the entire file is returned. + """ + import requests + from urllib.parse import urlparse + + parsed_url = urlparse(url) + if not parsed_url.scheme or not parsed_url.netloc: + raise ValueError(f"Invalid URL: {url}") + + headers = cls._get_request_headers() + if size > -1: + headers["Range"] = f"bytes={start}-{start + size}" + response = requests.get(url, allow_redirects=True, headers=headers) + response.raise_for_status() + + # Trim a ranged response to size; for size == -1 return the whole body ([:size] would drop the last byte) + return response.content[:size] if size > -1 else response.content + + @classmethod + def check_file_exist(cls, url: str) -> bool: + """ + Check if a file exists at the given URL. + Returns True if the file exists, False otherwise. 
+ """ + import requests + from urllib.parse import urlparse + + parsed_url = urlparse(url) + if not parsed_url.scheme or not parsed_url.netloc: + raise ValueError(f"Invalid URL: {url}") + + try: + headers = cls._get_request_headers() + headers["Range"] = "bytes=0-0" + response = requests.head(url, allow_redirects=True, headers=headers) + # Success (2xx) or redirect (3xx) + return 200 <= response.status_code < 400 + except requests.RequestException: + return False + + @classmethod + def _get_request_headers(cls) -> dict[str, str]: + """Prepare common headers for requests.""" + headers = {"User-Agent": "convert_hf_to_gguf"} + if os.environ.get("HF_TOKEN"): + headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}" + return headers diff --git a/lpm_kernel/L2/gguf-py/gguf/vocab.py b/lpm_kernel/L2/gguf-py/gguf/vocab.py index 9325d5eb..cca09798 100644 --- a/lpm_kernel/L2/gguf-py/gguf/vocab.py +++ b/lpm_kernel/L2/gguf-py/gguf/vocab.py @@ -5,16 +5,7 @@ import json import os from pathlib import Path -from typing import ( - Any, - Callable, - Sequence, - Mapping, - Iterable, - Protocol, - ClassVar, - runtime_checkable, -) +from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable from sentencepiece import SentencePieceProcessor @@ -32,9 +23,7 @@ class SpecialVocab: chat_template: str | Sequence[Mapping[str, str]] | None def __init__( - self, - path: str | os.PathLike[str], - load_merges: bool = False, + self, path: str | os.PathLike[str], load_merges: bool = False, special_token_types: Iterable[str] | None = None, n_vocab: int | None = None, ): @@ -47,60 +36,40 @@ def __init__( if special_token_types is not None: self.special_token_types = special_token_types else: - self.special_token_types = ( - "bos", - "eos", - "unk", - "sep", - "pad", - "cls", - "mask", - ) + self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask') self._load(Path(path)) def __repr__(self) -> str: - return "".format( - 
len(self.merges), - self.special_token_ids or "unset", - self.add_special_token or "unset", + return ''.format( + len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset", ) def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if self.merges: if not quiet: - logger.info(f"Adding {len(self.merges)} merge(s).") + logger.info(f'Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) elif self.load_merges: - logger.warning( - "Adding merges requested but no merges found, output may be non-functional." - ) + logger.warning('Adding merges requested but no merges found, output may be non-functional.') for typ, tokid in self.special_token_ids.items(): - id_handler: Callable[[int], None] | None = getattr( - gw, f"add_{typ}_token_id", None - ) + id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) if id_handler is None: - logger.warning( - f"No handler for special token type {typ} with id {tokid} - skipping" - ) + logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping') continue if not quiet: - logger.info(f"Setting special token type {typ} to {tokid}") + logger.info(f'Setting special token type {typ} to {tokid}') id_handler(tokid) for typ, value in self.add_special_token.items(): - add_handler: Callable[[bool], None] | None = getattr( - gw, f"add_add_{typ}_token", None - ) + add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) if add_handler is None: - logger.warning( - f"No handler for add_{typ}_token with value {value} - skipping" - ) + logger.warning(f'No handler for add_{typ}_token with value {value} - skipping') continue if not quiet: - logger.info(f"Setting add_{typ}_token to {value}") + logger.info(f'Setting add_{typ}_token to {value}') add_handler(value) if self.chat_template is not None: if not quiet: - logger.info(f"Setting chat_template to {self.chat_template}") + logger.info(f'Setting chat_template to 
{self.chat_template}') gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: @@ -110,12 +79,12 @@ def _load(self, path: Path) -> None: self._try_load_merges_txt(path) def _try_load_merges_txt(self, path: Path) -> bool: - merges_file = path / "merges.txt" + merges_file = path / 'merges.txt' if not merges_file.is_file(): return False - with open(merges_file, "r", encoding="utf-8") as fp: - first_line = next(fp, "").strip() - if not first_line.startswith("#"): + with open(merges_file, 'r', encoding = 'utf-8') as fp: + first_line = next(fp, '').strip() + if not first_line.startswith('#'): fp.seek(0) line_num = 0 else: @@ -128,11 +97,9 @@ def _try_load_merges_txt(self, path: Path) -> bool: continue parts = line.split(None, 3) if len(parts) != 2: - logger.warning( - f"{merges_file.name}: Line {line_num}: Entry malformed, ignoring" - ) + logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring') continue - merges.append(f"{parts[0]} {parts[1]}") + merges.append(f'{parts[0]} {parts[1]}') self.merges = merges return True @@ -140,44 +107,36 @@ def _set_special_token(self, typ: str, tid: Any) -> None: if not isinstance(tid, int): return if tid < 0: - raise ValueError(f"invalid value for special token type {typ}: {tid}") + raise ValueError(f'invalid value for special token type {typ}: {tid}') if self.n_vocab is None or tid < self.n_vocab: if typ in self.special_token_ids: return self.special_token_ids[typ] = tid return - logger.warning( - f"Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping" - ) + logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') def _try_load_from_tokenizer_json(self, path: Path) -> bool: - tokenizer_file = path / "tokenizer.json" + tokenizer_file = path / 'tokenizer.json' if tokenizer_file.is_file(): - with open(tokenizer_file, encoding="utf-8") as f: + with open(tokenizer_file, encoding = 'utf-8') as f: tokenizer = 
json.load(f) if self.load_merges: - merges = tokenizer.get("model", {}).get("merges") + merges = tokenizer.get('model', {}).get('merges') if isinstance(merges, list) and merges: if isinstance(merges[0], str): self.merges = merges - elif ( - isinstance(merges[0], list) - and len(merges[0]) == 2 - and isinstance(merges[0][0], str) - ): + elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str): # New format since transformers 4.45 to support spaces in merges - # ref: https://github.com/ggerganov/llama.cpp/issues/9692 + # ref: https://github.com/ggml-org/llama.cpp/issues/9692 # TODO: internally store as the new format instead of converting to old - if any(" " in s for pair in merges for s in pair): - logger.warning( - f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}' - ) + if any(' ' in s for pair in merges for s in pair): + logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}') self.merges = [ - " ".join( + ' '.join( [ # ensure the spaces are properly encoded - "".join( - chr(ord(c) + 256) if c == " " else c + ''.join( + chr(ord(c) + 256) if c == ' ' else c for c in part ) for part in pair @@ -187,30 +146,33 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: ] else: raise ValueError("Unknown tokenizer merges format") - added_tokens = tokenizer.get("added_tokens", {}) + added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} - tokenizer_config_file = path / "tokenizer_config.json" + tokenizer_config_file = path / 'tokenizer_config.json' if not tokenizer_config_file.is_file(): return True - with open(tokenizer_config_file, encoding="utf-8") as f: + with open(tokenizer_config_file, encoding = 'utf-8') as f: tokenizer_config = json.load(f) - chat_template = tokenizer_config.get("chat_template") + chat_template_alt = None + chat_template_file = path / 'chat_template.json' + if chat_template_file.is_file(): + with open(chat_template_file, encoding = 'utf-8') as f: + 
chat_template_alt = json.load(f).get('chat_template') + chat_template = tokenizer_config.get('chat_template', chat_template_alt) if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: - logger.warning( - f"Bad type for chat_template field in {tokenizer_config_file!r} - ignoring" - ) + logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring') for typ in self.special_token_types: - add_entry = tokenizer_config.get(f"add_{typ}_token") + add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): self.add_special_token[typ] = add_entry - entry = tokenizer_config.get(f"{typ}_token") + entry = tokenizer_config.get(f'{typ}_token') if isinstance(entry, str): tc_content = entry elif isinstance(entry, dict): - entry_content = entry.get("content") + entry_content = entry.get('content') if not isinstance(entry_content, str): continue tc_content = entry_content @@ -218,24 +180,20 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: continue # We only need the first match here. maybe_token_id = next( - ( - atok.get("id") - for atok in added_tokens - if atok.get("content") == tc_content - ), + (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content), None, ) self._set_special_token(typ, maybe_token_id) return True def _try_load_from_config_json(self, path: Path) -> bool: - config_file = path / "config.json" + config_file = path / 'config.json' if not config_file.is_file(): return False - with open(config_file, encoding="utf-8") as f: + with open(config_file, encoding = 'utf-8') as f: config = json.load(f) for typ in self.special_token_types: - self._set_special_token(typ, config.get(f"{typ}_token_id")) + self._set_special_token(typ, config.get(f'{typ}_token_id')) return True @@ -252,11 +210,8 @@ class Vocab(BaseVocab, Protocol): added_tokens_list: list[str] fname_tokenizer: Path - def __init__(self, base_path: Path): - ... 
- - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - ... + def __init__(self, base_path: Path): ... + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... class NoVocab(BaseVocab): @@ -274,59 +229,54 @@ class BpeVocab(Vocab): def __init__(self, base_path: Path): added_tokens: dict[str, int] = {} - if (fname_tokenizer := base_path / "vocab.json").exists(): + if (fname_tokenizer := base_path / 'vocab.json').exists(): # "slow" tokenizer with open(fname_tokenizer, encoding="utf-8") as f: self.vocab = json.load(f) try: # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - with open(base_path / "added_tokens.json", encoding="utf-8") as f: + with open(base_path / 'added_tokens.json', encoding="utf-8") as f: added_tokens = json.load(f) except FileNotFoundError: pass else: # "fast" tokenizer - fname_tokenizer = base_path / "tokenizer.json" + fname_tokenizer = base_path / 'tokenizer.json' # if this fails, FileNotFoundError propagates to caller with open(fname_tokenizer, encoding="utf-8") as f: tokenizer_json = json.load(f) - tokenizer_model: dict[str, Any] = tokenizer_json["model"] + tokenizer_model: dict[str, Any] = tokenizer_json['model'] if ( - tokenizer_model["type"] != "BPE" - or tokenizer_model.get("byte_fallback", False) - or tokenizer_json["decoder"]["type"] != "ByteLevel" + tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'ByteLevel' ): - raise FileNotFoundError("Cannot find GPT-2 BPE tokenizer") + raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer') self.vocab = tokenizer_model["vocab"] - if (added := tokenizer_json.get("added_tokens")) is not None: + if (added := tokenizer_json.get('added_tokens')) is not None: # Added tokens here can be duplicates of the main vocabulary. 
- added_tokens = { - item["content"]: item["id"] - for item in added - if item["content"] not in self.vocab - } + added_tokens = {item['content']: item['id'] + for item in added + if item['content'] not in self.vocab} - vocab_size = len(self.vocab) + vocab_size = len(self.vocab) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) + actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: expected_end_id = vocab_size + len(actual_ids) - 1 - raise ValueError( - f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " - f"{vocab_size} - {expected_end_id}; got {actual_ids}" - ) + raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " + f"{vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_dict = added_tokens - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_dict = added_tokens + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} @@ -353,44 +303,40 @@ class SentencePieceVocab(Vocab): def __init__(self, base_path: Path): added_tokens: dict[str, int] = {} - if (fname_tokenizer := base_path / "tokenizer.model").exists(): + if (fname_tokenizer := base_path / 'tokenizer.model').exists(): # normal location try: - with open(base_path / "added_tokens.json", encoding="utf-8") as f: + with open(base_path / 'added_tokens.json', encoding="utf-8") as f: added_tokens = 
json.load(f) except FileNotFoundError: pass - elif not (fname_tokenizer := base_path.parent / "tokenizer.model").exists(): + elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists(): # not found in alternate location either - raise FileNotFoundError("Cannot find tokenizer.model") + raise FileNotFoundError('Cannot find tokenizer.model') self.sentencepiece_tokenizer = SentencePieceProcessor() self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer)) vocab_size = self.sentencepiece_tokenizer.vocab_size() - new_tokens = { - id: piece for piece, id in added_tokens.items() if id >= vocab_size - } + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) + actual_new_ids = sorted(new_tokens.keys()) if expected_new_ids != actual_new_ids: - raise ValueError( - f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}" - ) + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") # Token pieces that were added to the base vocabulary. 
- self.added_tokens_dict = added_tokens - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_dict = added_tokens + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): piece = tokenizer.IdToPiece(i) - text = piece.encode("utf-8") + text = piece.encode("utf-8") score: float = tokenizer.GetScore(i) toktype = gguf.TokenType.NORMAL @@ -428,27 +374,25 @@ class LlamaHfVocab(Vocab): name = "hfft" def __init__(self, base_path: Path): - fname_tokenizer = base_path / "tokenizer.json" + fname_tokenizer = base_path / 'tokenizer.json' # if this fails, FileNotFoundError propagates to caller - with open(fname_tokenizer, encoding="utf-8") as f: + with open(fname_tokenizer, encoding='utf-8') as f: tokenizer_json = json.load(f) # pre-check so we know if we need transformers - tokenizer_model: dict[str, Any] = tokenizer_json["model"] + tokenizer_model: dict[str, Any] = tokenizer_json['model'] is_llama3 = ( - tokenizer_model["type"] == "BPE" - and tokenizer_model.get("ignore_merges", False) - and not tokenizer_model.get("byte_fallback", True) + tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False) + and not tokenizer_model.get('byte_fallback', True) ) if is_llama3: - raise TypeError("Llama 3 must be converted with BpeVocab") + raise TypeError('Llama 3 must be converted with BpeVocab') if not is_llama3 and ( - tokenizer_model["type"] != "BPE" - or not tokenizer_model.get("byte_fallback", False) - or tokenizer_json["decoder"]["type"] != "Sequence" + 
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'Sequence' ): - raise FileNotFoundError("Cannot find Llama BPE tokenizer") + raise FileNotFoundError('Cannot find Llama BPE tokenizer') try: from transformers import AutoTokenizer @@ -470,7 +414,7 @@ def __init__(self, base_path: Path): # Initialize lists and dictionaries for added tokens self.added_tokens_list = [] self.added_tokens_dict = dict() - self.added_tokens_ids = set() + self.added_tokens_ids = set() # Process added tokens for tok, tokidx in sorted( @@ -491,7 +435,7 @@ def __init__(self, base_path: Path): # Set vocabulary sizes self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer @@ -509,27 +453,17 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: token_text = reverse_vocab[token_id].encode("utf-8") # Yield token text, score, and type - yield ( - token_text, - self.get_token_score(token_id), - self.get_token_type( - token_id, - token_text, - self.special_ids, # Reuse already stored special IDs - ), + yield token_text, self.get_token_score(token_id), self.get_token_type( + token_id, token_text, self.special_ids # Reuse already stored special IDs ) - def get_token_type( - self, token_id: int, token_text: bytes, special_ids: set[int] - ) -> gguf.TokenType: + def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: # Special case for byte tokens - if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): + if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): return gguf.TokenType.BYTE # Determine token type based on whether it's a special token - return ( - gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - ) + return gguf.TokenType.CONTROL if token_id in special_ids else 
gguf.TokenType.NORMAL def get_token_score(self, token_id: int) -> float: # Placeholder for actual logic to determine the token's score @@ -539,9 +473,7 @@ def get_token_score(self, token_id: int) -> float: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type( - self.specials[text], b"", self.special_ids - ) + toktype = self.get_token_type(self.specials[text], b'', self.special_ids) score = self.get_token_score(self.specials[text]) else: toktype = gguf.TokenType.USER_DEFINED diff --git a/lpm_kernel/L2/gguf-py/pyproject.toml b/lpm_kernel/L2/gguf-py/pyproject.toml index 78c6baa6..0c827256 100644 --- a/lpm_kernel/L2/gguf-py/pyproject.toml +++ b/lpm_kernel/L2/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.15.0" +version = "0.16.2" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ @@ -9,7 +9,7 @@ packages = [ ] readme = "README.md" homepage = "https://ggml.ai" -repository = "https://github.com/ggerganov/llama.cpp" +repository = "https://github.com/ggml-org/llama.cpp" keywords = ["ggml", "gguf", "llama.cpp"] classifiers = [ "Programming Language :: Python :: 3", @@ -23,10 +23,14 @@ numpy = ">=1.17" tqdm = ">=4.27" pyyaml = ">=5.1" sentencepiece = ">=0.1.98,<=0.2.0" +PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true } [tool.poetry.dev-dependencies] pytest = "^5.2" +[tool.poetry.extras] +gui = ["PySide6"] + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" @@ -36,3 +40,4 @@ gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint" gguf-dump = "gguf.scripts:gguf_dump_entrypoint" gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint" gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint" +gguf-editor-gui = "gguf.scripts:gguf_editor_gui_entrypoint" diff --git 
a/lpm_kernel/L2/gguf-py/tests/test_metadata.py b/lpm_kernel/L2/gguf-py/tests/test_metadata.py index e9c183b8..40d484f4 100755 --- a/lpm_kernel/L2/gguf-py/tests/test_metadata.py +++ b/lpm_kernel/L2/gguf-py/tests/test_metadata.py @@ -6,602 +6,231 @@ import sys # Necessary to load the local gguf package -if ( - "NO_LOCAL_GGUF" not in os.environ - and (Path(__file__).parent.parent.parent / "gguf-py").exists() -): +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent)) import gguf class TestMetadataMethod(unittest.TestCase): + def test_id_to_title(self): - self.assertEqual( - gguf.Metadata.id_to_title("Mixtral-8x7B-Instruct-v0.1"), - "Mixtral 8x7B Instruct v0.1", - ) - self.assertEqual( - gguf.Metadata.id_to_title("Meta-Llama-3-8B"), "Meta Llama 3 8B" - ) - self.assertEqual( - gguf.Metadata.id_to_title("hermes-2-pro-llama-3-8b-DPO"), - "Hermes 2 Pro Llama 3 8b DPO", - ) + self.assertEqual(gguf.Metadata.id_to_title("Mixtral-8x7B-Instruct-v0.1"), "Mixtral 8x7B Instruct v0.1") + self.assertEqual(gguf.Metadata.id_to_title("Meta-Llama-3-8B"), "Meta Llama 3 8B") + self.assertEqual(gguf.Metadata.id_to_title("hermes-2-pro-llama-3-8b-DPO"), "Hermes 2 Pro Llama 3 8b DPO") def test_get_model_id_components(self): # This is the basic standard form with organization marker - self.assertEqual( - gguf.Metadata.get_model_id_components("Mistral/Mixtral-8x7B-Instruct-v0.1"), - ( - "Mixtral-8x7B-Instruct-v0.1", - "Mistral", - "Mixtral", - "Instruct", - "v0.1", - "8x7B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Mistral/Mixtral-8x7B-Instruct-v0.1"), + ('Mixtral-8x7B-Instruct-v0.1', "Mistral", 'Mixtral', 'Instruct', 'v0.1', '8x7B')) # Similar to basic standard form but without organization marker - self.assertEqual( - gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct-v0.1"), - ("Mixtral-8x7B-Instruct-v0.1", None, "Mixtral", "Instruct", "v0.1", "8x7B"), - ) + 
self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct-v0.1"), + ('Mixtral-8x7B-Instruct-v0.1', None, 'Mixtral', 'Instruct', 'v0.1', '8x7B')) # Missing version - self.assertEqual( - gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct"), - ("Mixtral-8x7B-Instruct", None, "Mixtral", "Instruct", None, "8x7B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct"), + ('Mixtral-8x7B-Instruct', None, 'Mixtral', 'Instruct', None, '8x7B')) # Missing finetune - self.assertEqual( - gguf.Metadata.get_model_id_components("Mixtral-8x7B-v0.1"), - ("Mixtral-8x7B-v0.1", None, "Mixtral", None, "v0.1", "8x7B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-v0.1"), + ('Mixtral-8x7B-v0.1', None, 'Mixtral', None, 'v0.1', '8x7B')) # Base name and size label only - self.assertEqual( - gguf.Metadata.get_model_id_components("Mixtral-8x7B"), - ("Mixtral-8x7B", None, "Mixtral", None, None, "8x7B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B"), + ('Mixtral-8x7B', None, 'Mixtral', None, None, '8x7B')) # Base name and version only - self.assertEqual( - gguf.Metadata.get_model_id_components("Mixtral-v0.1"), - ("Mixtral-v0.1", None, "Mixtral", None, "v0.1", None), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-v0.1"), + ('Mixtral-v0.1', None, 'Mixtral', None, 'v0.1', None)) ## Edge Cases ## # This is too ambiguous... best to err on caution and output nothing - self.assertEqual( - gguf.Metadata.get_model_id_components("Mixtral"), - ("Mixtral", None, None, None, None, None), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral"), + ('Mixtral', None, None, None, None, None)) # Basename has numbers mixed in and also size label provided. 
Must avoid capturing number in basename - self.assertEqual( - gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"), - ("Meta-Llama-3-8B", "NousResearch", "Meta-Llama-3", None, None, "8B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"), + ('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B')) # Non standard naming - self.assertEqual( - gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"), - ("Qwen1.5-MoE-A2.7B-Chat", None, "Qwen1.5-MoE", "Chat", None, "A2.7B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"), + ('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B')) # Capture 'sub size labels' e.g. A14B in '57B-A14B' usually refers to activated params/weight count - self.assertEqual( - gguf.Metadata.get_model_id_components("Qwen2-57B-A14B-Instruct"), - ("Qwen2-57B-A14B-Instruct", None, "Qwen2", "Instruct", None, "57B-A14B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Qwen2-57B-A14B-Instruct"), + ('Qwen2-57B-A14B-Instruct', None, 'Qwen2', 'Instruct', None, '57B-A14B')) # Check that it can handle a real model id with no version code # Note that 4k in this string is non standard and microsoft were referring to context length rather than weight count - self.assertEqual( - gguf.Metadata.get_model_id_components( - "microsoft/Phi-3-mini-4k-instruct", 4 * 10**9 - ), - ( - "Phi-3-mini-4k-instruct", - "microsoft", - "Phi-3", - "4k-instruct", - None, - "mini", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Phi-3-mini-4k-instruct", 4 * 10**9), + ('Phi-3-mini-4k-instruct', 'microsoft', 'Phi-3', '4k-instruct', None, 'mini')) # There is some legitimate models with only thousands of parameters - self.assertEqual( - gguf.Metadata.get_model_id_components( - "delphi-suite/stories-llama2-50k", 50 * 10**3 - ), - ("stories-llama2-50k", "delphi-suite", "stories-llama2", None, None, "50K"), - ) + 
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3), + ('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K')) # Non standard and not easy to disambiguate - self.assertEqual( - gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"), - ( - "DeepSeek-Coder-V2-Lite-Instruct", - None, - "DeepSeek-Coder-V2-Lite", - "Instruct", - None, - None, - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"), + ('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None)) # This is a real model_id where they append 2DPO to refer to Direct Preference Optimization - self.assertEqual( - gguf.Metadata.get_model_id_components( - "crestf411/daybreak-kunoichi-2dpo-7b" - ), - ( - "daybreak-kunoichi-2dpo-7b", - "crestf411", - "daybreak-kunoichi", - "2dpo", - None, - "7B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("crestf411/daybreak-kunoichi-2dpo-7b"), + ('daybreak-kunoichi-2dpo-7b', 'crestf411', 'daybreak-kunoichi', '2dpo', None, '7B')) # This is a real model id where the weight size has a decimal point - self.assertEqual( - gguf.Metadata.get_model_id_components("Qwen2-0.5B-Instruct"), - ("Qwen2-0.5B-Instruct", None, "Qwen2", "Instruct", None, "0.5B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Qwen2-0.5B-Instruct"), + ('Qwen2-0.5B-Instruct', None, 'Qwen2', 'Instruct', None, '0.5B')) # Uses an underscore in the size label - self.assertEqual( - gguf.Metadata.get_model_id_components("smallcloudai/Refact-1_6B-fim"), - ("Refact-1_6B-fim", "smallcloudai", "Refact", "fim", None, "1.6B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("smallcloudai/Refact-1_6B-fim"), + ('Refact-1_6B-fim', 'smallcloudai', 'Refact', 'fim', None, '1.6B')) # Uses Iter3 for the version - self.assertEqual( - gguf.Metadata.get_model_id_components("UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3"), - ( - 
"Gemma-2-9B-It-SPPO-Iter3", - "UCLA-AGI", - "Gemma-2", - "It-SPPO", - "Iter3", - "9B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3"), + ('Gemma-2-9B-It-SPPO-Iter3', 'UCLA-AGI', 'Gemma-2', 'It-SPPO', 'Iter3', '9B')) # Has two potential versions in the basename - self.assertEqual( - gguf.Metadata.get_model_id_components( - "NousResearch/Hermes-2-Theta-Llama-3-8B" - ), - ( - "Hermes-2-Theta-Llama-3-8B", - "NousResearch", - "Hermes-2-Theta-Llama-3", - None, - None, - "8B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Hermes-2-Theta-Llama-3-8B"), + ('Hermes-2-Theta-Llama-3-8B', 'NousResearch', 'Hermes-2-Theta-Llama-3', None, None, '8B')) # Potential version in the basename - self.assertEqual( - gguf.Metadata.get_model_id_components("SeaLLMs/SeaLLMs-v3-7B-Chat"), - ("SeaLLMs-v3-7B-Chat", "SeaLLMs", "SeaLLMs-v3", "Chat", None, "7B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("SeaLLMs/SeaLLMs-v3-7B-Chat"), + ('SeaLLMs-v3-7B-Chat', 'SeaLLMs', 'SeaLLMs-v3', 'Chat', None, '7B')) # Underscore in the basename, and 1m for the context size - self.assertEqual( - gguf.Metadata.get_model_id_components( - "internlm/internlm2_5-7b-chat-1m", 7 * 10**9 - ), - ( - "internlm2_5-7b-chat-1m", - "internlm", - "internlm2_5", - "chat-1m", - None, - "7B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("internlm/internlm2_5-7b-chat-1m", 7 * 10**9), + ('internlm2_5-7b-chat-1m', 'internlm', 'internlm2_5', 'chat-1m', None, '7B')) # Version before the finetune name - self.assertEqual( - gguf.Metadata.get_model_id_components("pszemraj/jamba-900M-v0.13-KIx2"), - ("jamba-900M-v0.13-KIx2", "pszemraj", "jamba", "KIx2", "v0.13", "900M"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("pszemraj/jamba-900M-v0.13-KIx2"), + ('jamba-900M-v0.13-KIx2', 'pszemraj', 'jamba', 'KIx2', 'v0.13', '900M')) # TODO: hf suffix which could be ignored but isn't - self.assertEqual( - 
gguf.Metadata.get_model_id_components("state-spaces/mamba-2.8b-hf"), - ("mamba-2.8b-hf", "state-spaces", "mamba", "hf", None, "2.8B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("state-spaces/mamba-2.8b-hf"), + ('mamba-2.8b-hf', 'state-spaces', 'mamba', 'hf', None, '2.8B')) # Two sizes, don't merge them, the other is the number of tokens on which it was trained - self.assertEqual( - gguf.Metadata.get_model_id_components( - "abacaj/llama-161M-100B", 161 * 10**6 - ), - ("llama-161M-100B", "abacaj", "llama", "100b", None, "161M"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("abacaj/llama-161M-100B", 161 * 10**6), + ('llama-161M-100B', 'abacaj', 'llama', '100b', None, '161M')) # It's a trap, there is no size label - self.assertEqual( - gguf.Metadata.get_model_id_components("SparseLLM/relu-100B", 1340 * 10**6), - ("relu-100B", "SparseLLM", "relu", "100b", None, None), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("SparseLLM/relu-100B", 1340 * 10**6), + ('relu-100B', 'SparseLLM', 'relu', '100b', None, None)) # Weird size notation - self.assertEqual( - gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"), - ("bloom-7b1-petals", "bigscience", "bloom", "petals", None, "7.1B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"), + ('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B')) # Ignore full-text size labels when there are number-based ones, and deduplicate size labels - self.assertEqual( - gguf.Metadata.get_model_id_components( - "MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1" - ), - ( - "GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1", - "MaziyarPanahi", - "GreenNode-mini", - "multilingual-v1olet-Mistral-Instruct", - "v0.1", - "7B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"), + 
('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B')) # Instruct in a name without a size label - self.assertEqual( - gguf.Metadata.get_model_id_components( - "mistralai/Mistral-Nemo-Instruct-2407" - ), - ( - "Mistral-Nemo-Instruct-2407", - "mistralai", - "Mistral-Nemo", - "Instruct", - "2407", - None, - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"), + ('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None)) # Non-obvious splitting relying on 'chat' keyword - self.assertEqual( - gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"), - ( - "DeepSeek-V2-Chat-0628", - "deepseek-ai", - "DeepSeek-V2", - "Chat", - "0628", - None, - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"), + ('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None)) # Multiple versions - self.assertEqual( - gguf.Metadata.get_model_id_components( - "OpenGVLab/Mini-InternVL-Chat-2B-V1-5" - ), - ( - "Mini-InternVL-Chat-2B-V1-5", - "OpenGVLab", - "Mini-InternVL", - "Chat", - "V1-5", - "2B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"), + ('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B')) # TODO: DPO in the name - self.assertEqual( - gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"), - ("bagel-dpo-2.8b-v0.2", "jondurbin", "bagel-dpo", None, "v0.2", "2.8B"), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"), + ('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B')) # DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename - self.assertEqual( - gguf.Metadata.get_model_id_components( - 
"voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized" - ), - ( - "SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized", - "voxmenthe", - "SFR-Iterative-DPO-LLaMA-3", - "R-unquantized", - None, - "8B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"), + ('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B')) # Too ambiguous # TODO: should "base" be a 'finetune' or 'size_label'? # (in this case it should be a size label, but other models use it to signal that they are not finetuned) - self.assertEqual( - gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"), - ("Florence-2-base", "microsoft", None, None, None, None), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"), + ('Florence-2-base', 'microsoft', None, None, None, None)) ## Invalid cases ## # Start with a dash and has dashes in rows - self.assertEqual( - gguf.Metadata.get_model_id_components( - "mistralai/-Mistral--Nemo-Base-2407-" - ), - ( - "-Mistral--Nemo-Base-2407-", - "mistralai", - "Mistral-Nemo-Base", - None, - "2407", - None, - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"), + ('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None)) ## LoRA ## - self.assertEqual( - gguf.Metadata.get_model_id_components( - "Llama-3-Instruct-abliteration-LoRA-8B" - ), - ( - "Llama-3-Instruct-abliteration-LoRA-8B", - None, - "Llama-3", - "Instruct-abliteration-LoRA", - None, - "8B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"), + ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B')) # Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix - self.assertEqual( - gguf.Metadata.get_model_id_components( - 
"Llama-3-Instruct-abliteration-LoRA-8B", -1234 - ), - ( - "Llama-3-Instruct-abliteration-LoRA-8B", - None, - "Llama-3", - "Instruct-abliteration", - None, - "8B", - ), - ) + self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234), + ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B')) def test_apply_metadata_heuristic_from_model_card(self): model_card = { - "tags": [ - "Llama-3", - "instruct", - "finetune", - "chatml", - "DPO", - "RLHF", - "gpt4", - "synthetic data", - "distillation", - "function calling", - "json mode", - "axolotl", - ], - "model-index": [{"name": "Mixtral-8x7B-Instruct-v0.1", "results": []}], - "language": ["en"], - "datasets": ["teknium/OpenHermes-2.5"], - "widget": [ - { - "example_title": "Hermes 2 Pro", - "messages": [ - { - "role": "system", - "content": "You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.", - }, - { - "role": "user", - "content": "Write a short story about Goku discovering kirby has teamed up with Majin Buu to destroy the world.", - }, - ], - } - ], - "base_model": ["EmbeddedLLM/Mistral-7B-Merge-14-v0", "janai-hq/trinity-v1"], + 'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'], + 'model-index': [{'name': 'Mixtral-8x7B-Instruct-v0.1', 'results': []}], + 'language': ['en'], + 'datasets': ['teknium/OpenHermes-2.5'], + 'widget': [{'example_title': 'Hermes 2 Pro', 'messages': [{'role': 'system', 'content': 'You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.'}, {'role': 'user', 'content': 'Write a short story about Goku discovering kirby has teamed up with Majin Buu to destroy the world.'}]}], + 'base_model': ["EmbeddedLLM/Mistral-7B-Merge-14-v0", "janai-hq/trinity-v1"] } - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), 
model_card, None, None - ) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) expect = gguf.Metadata() - expect.base_models = [ - { - "name": "Mistral 7B Merge 14 v0", - "organization": "EmbeddedLLM", - "version": "14-v0", - "repo_url": "https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0", - }, - { - "name": "Trinity v1", - "organization": "Janai Hq", - "version": "v1", - "repo_url": "https://huggingface.co/janai-hq/trinity-v1", - }, - ] - expect.tags = [ - "Llama-3", - "instruct", - "finetune", - "chatml", - "DPO", - "RLHF", - "gpt4", - "synthetic data", - "distillation", - "function calling", - "json mode", - "axolotl", - ] - expect.languages = ["en"] - expect.datasets = [ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] + expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}] + expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'] + expect.languages=['en'] + expect.datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}] self.assertEqual(got, expect) # Base Model spec is inferred from model id - model_card = {"base_models": "teknium/OpenHermes-2.5"} - expect = gguf.Metadata( - base_models=[ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - ) - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card, None, None - ) + model_card = {'base_models': 
'teknium/OpenHermes-2.5'} + expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) self.assertEqual(got, expect) # Base Model spec is only url - model_card = {"base_models": ["https://huggingface.co/teknium/OpenHermes-2.5"]} - expect = gguf.Metadata( - base_models=[ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - ) - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card, None, None - ) + model_card = {'base_models': ['https://huggingface.co/teknium/OpenHermes-2.5']} + expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) self.assertEqual(got, expect) # Base Model spec is given directly - model_card = { - "base_models": [ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - } - expect = gguf.Metadata( - base_models=[ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - ) - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card, None, None - ) + model_card = {'base_models': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]} + expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = 
gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) self.assertEqual(got, expect) # Dataset spec is inferred from model id - model_card = {"datasets": "teknium/OpenHermes-2.5"} - expect = gguf.Metadata( - datasets=[ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - ) - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card, None, None - ) + model_card = {'datasets': 'teknium/OpenHermes-2.5'} + expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) self.assertEqual(got, expect) # Dataset spec is only url - model_card = {"datasets": ["https://huggingface.co/teknium/OpenHermes-2.5"]} - expect = gguf.Metadata( - datasets=[ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - ) - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card, None, None - ) + model_card = {'datasets': ['https://huggingface.co/teknium/OpenHermes-2.5']} + expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) self.assertEqual(got, expect) # Dataset spec is given directly - model_card = { - "datasets": [ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": "https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - } - expect = gguf.Metadata( - datasets=[ - { - "name": "OpenHermes 2.5", - "organization": "Teknium", - "version": "2.5", - "repo_url": 
"https://huggingface.co/teknium/OpenHermes-2.5", - } - ] - ) - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card, None, None - ) + model_card = {'datasets': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]} + expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None) self.assertEqual(got, expect) def test_apply_metadata_heuristic_from_hf_parameters(self): hf_params = {"_name_or_path": "./hermes-2-pro-llama-3-8b-DPO"} - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card=None, hf_params=hf_params, model_path=None - ) - expect = gguf.Metadata( - name="Hermes 2 Pro Llama 3 8b DPO", - finetune="DPO", - basename="hermes-2-pro-llama-3", - size_label="8B", - ) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card=None, hf_params=hf_params, model_path=None) + expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B') self.assertEqual(got, expect) def test_apply_metadata_heuristic_from_model_dir(self): model_dir_path = Path("./hermes-2-pro-llama-3-8b-DPO") - got = gguf.Metadata.apply_metadata_heuristic( - gguf.Metadata(), model_card=None, hf_params=None, model_path=model_dir_path - ) - expect = gguf.Metadata( - name="Hermes 2 Pro Llama 3 8b DPO", - finetune="DPO", - basename="hermes-2-pro-llama-3", - size_label="8B", - ) + got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card=None, hf_params=None, model_path=model_dir_path) + expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B') self.assertEqual(got, expect) diff --git a/lpm_kernel/L2/gguf-py/tests/test_quants.py 
b/lpm_kernel/L2/gguf-py/tests/test_quants.py index 33a1bec6..f04d5acc 100755 --- a/lpm_kernel/L2/gguf-py/tests/test_quants.py +++ b/lpm_kernel/L2/gguf-py/tests/test_quants.py @@ -16,10 +16,7 @@ import numpy as np # Necessary to load the local gguf package -if ( - "NO_LOCAL_GGUF" not in os.environ - and (Path(__file__).parent.parent.parent / "gguf-py").exists() -): +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent)) import gguf @@ -67,117 +64,55 @@ def __init__(self, libggml: Path): self.libggml.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,) for t in ( - "q4_0", - "q4_1", - "q5_0", - "q5_1", - "q8_0", - "q2_K", - "q3_K", - "q4_K", - "q5_K", - "q6_K", - "tq1_0", - "tq2_0", - "iq2_xxs", - "iq2_xs", - "iq2_s", - "iq3_xxs", - "iq3_s", - "iq1_s", - "iq1_m", - "iq4_nl", - "iq4_xs", + "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", + "q2_K", "q3_K", "q4_K", "q5_K", "q6_K", + "tq1_0", "tq2_0", + "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m", + "iq4_nl", "iq4_xs", ): - dequant_func: ctypes._NamedFuncPointer = getattr( - self.libggml, "dequantize_row_" + t - ) + dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + t) dequant_func.restype = None - dequant_func.argtypes = ( - ctypes.c_void_p, - ctypes.POINTER(ctypes.c_float), - ctypes.c_int64, - ) + dequant_func.argtypes = (ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_int64) self.libggml.ggml_fp16_to_fp32_row.restype = None - self.libggml.ggml_fp16_to_fp32_row.argtypes = ( - ctypes.POINTER(ctypes.c_uint16), - ctypes.POINTER(ctypes.c_float), - ctypes.c_int64, - ) + self.libggml.ggml_fp16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64) self.libggml.ggml_bf16_to_fp32_row.restype = None - self.libggml.ggml_bf16_to_fp32_row.argtypes = ( - ctypes.POINTER(ctypes.c_uint16), - ctypes.POINTER(ctypes.c_float), - 
ctypes.c_int64, - ) + self.libggml.ggml_bf16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64) self.libggml.ggml_init.argtypes = (ggml_init_params,) self.libggml.ggml_init(ggml_init_params(1 * 1024 * 1024, 0, False)) def dequantize(self, tensor: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: - result = np.zeros( - gguf.quant_shape_from_byte_shape(tensor.shape, qtype), - dtype=np.float32, - order="C", - ) + result = np.zeros(gguf.quant_shape_from_byte_shape(tensor.shape, qtype), dtype=np.float32, order="C") if qtype == GGMLQuantizationType.F32: # no-op result = tensor.view(np.float32) elif qtype == GGMLQuantizationType.F16: - self.libggml.ggml_fp16_to_fp32_row( - tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), - result.ctypes.data_as(c_float_p), - result.size, - ) + self.libggml.ggml_fp16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size) elif qtype == GGMLQuantizationType.BF16: - self.libggml.ggml_bf16_to_fp32_row( - tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), - result.ctypes.data_as(c_float_p), - result.size, - ) + self.libggml.ggml_bf16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size) else: lw_qname = qtype.name.lower() if lw_qname[-1] == "k": lw_qname = lw_qname[:-1] + "K" - dequant_func: ctypes._NamedFuncPointer = getattr( - self.libggml, "dequantize_row_" + lw_qname - ) - dequant_func( - tensor.ctypes.data_as(ctypes.c_void_p), - result.ctypes.data_as(c_float_p), - result.size, - ) + dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + lw_qname) + dequant_func(tensor.ctypes.data_as(ctypes.c_void_p), result.ctypes.data_as(c_float_p), result.size) return result def quantize(self, data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: - result = np.zeros( - gguf.quant_shape_to_byte_shape(data.shape, qtype), 
dtype=np.uint8, order="C" - ) + result = np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C") if self.libggml.ggml_quantize_requires_imatrix(qtype.value): # TODO: is a column-wise sum of squares appropriate? - qw = np.sum( - (data * data).reshape((-1, data.shape[-1])), axis=0 - ).ctypes.data_as(c_float_p) + qw = np.sum((data * data).reshape((-1, data.shape[-1])), axis=0).ctypes.data_as(c_float_p) else: qw = ctypes.cast(0, c_float_p) - result_size = self.libggml.ggml_quantize_chunk( - qtype.value, - data.ctypes.data_as(c_float_p), - result.ctypes.data_as(ctypes.c_void_p), - 0, - prod(data.shape[:-1]), - data.shape[-1], - qw, - ) + result_size = self.libggml.ggml_quantize_chunk(qtype.value, data.ctypes.data_as(c_float_p), result.ctypes.data_as(ctypes.c_void_p), 0, prod(data.shape[:-1]), data.shape[-1], qw) assert result.size == result_size return result -def compare_tensors( - t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType -) -> bool: +def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) -> bool: same = np.array_equal(t1, t2) if same: return True @@ -195,30 +130,20 @@ def compare_tensors( if num_bad_blocks == 0 and t1.shape == t2.shape: logger.debug("Bits are equal, but arrays don't match, likely contains NANs") return True - logger.debug( - f"{num_bad_blocks} bad blocks ({100 * num_bad_blocks / x.shape[0]:.6f}%)" - ) + logger.debug(f"{num_bad_blocks} bad blocks ({100 * num_bad_blocks / x.shape[0]:.6f}%)") bad_block_id = np.argmax(diff_bits, axis=0) logger.debug(f"Worst block id: {bad_block_id}") - logger.debug( - f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}" - ) + logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}") sum_diff_bits = np.sum(diff_bits) - logger.debug( - f"{sum_diff_bits} bits differ ({100 * sum_diff_bits / (x.size * 8):.6f}%)" - ) + 
logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits / (x.size * 8):.6f}%)") return False def do_test(libggml_path: Path, quick: bool = False): ggml_quants = GGMLQuants(libggml_path) - np.set_printoptions( - precision=None, - threshold=(4 * 256) + 1, - formatter={"int": lambda n: "0x%02X" % n}, - ) + np.set_printoptions(precision=None, threshold=(4 * 256) + 1, formatter={"int": lambda n: "0x%02X" % n}) r = np.random.randn(8, 1024, 1024).astype(np.float32, copy=False) @@ -227,18 +152,14 @@ def do_test(libggml_path: Path, quick: bool = False): has_quantize = False try: - gguf.dequantize( - np.zeros((gguf.GGML_QUANT_SIZES[qtype][1]), dtype=np.uint8), qtype - ) + gguf.dequantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][1]), dtype=np.uint8), qtype) has_dequantize = True except (NotImplementedError, AssertionError) as e: if isinstance(e, AssertionError): logger.error(f"Error with {qtype.name}: {e}") raise e try: - gguf.quantize( - np.zeros((gguf.GGML_QUANT_SIZES[qtype][0]), dtype=np.float32), qtype - ) + gguf.quantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][0]), dtype=np.float32), qtype) has_quantize = True except (NotImplementedError, AssertionError) as e: if isinstance(e, AssertionError): @@ -289,9 +210,7 @@ def do_test(libggml_path: Path, quick: bool = False): else: logger.info(f"Dequantization from {qtype.name} matches exactly ✅") - rq_shape = gguf.quants.quant_shape_to_byte_shape( - (8, 1024, 1024 // 2), qtype - ) + rq_shape = gguf.quants.quant_shape_to_byte_shape((8, 1024, 1024 // 2), qtype) rq = np.random.random(rq_shape).astype(np.float16).view(np.uint8) logger.debug(f"Dequantizing random f16 data as {qtype.name} with Python") @@ -302,34 +221,15 @@ def do_test(libggml_path: Path, quick: bool = False): dequant_equal = compare_tensors(pydq, ggdq, qtype) if not dequant_equal: - logger.error( - f"Dequantization from random f16 data as {qtype.name} does not match ❌" - ) + logger.error(f"Dequantization from random f16 data as {qtype.name} does not match ❌") 
else: - logger.info( - f"Dequantization from random f16 data as {qtype.name} matches exactly ✅" - ) + logger.info(f"Dequantization from random f16 data as {qtype.name} matches exactly ✅") if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Test Python (de)quantization against the reference C implementation" - ) - parser.add_argument( - "--libggml", - type=Path, - default=Path(__file__).parent.parent.parent - / "build" - / "ggml" - / "src" - / "libggml.so", - help="The path to libggml.so", - ) - parser.add_argument( - "--quick", - action="store_true", - help="Don't quantize with C when it's not strictly necessary", - ) + parser = argparse.ArgumentParser(description="Test Python (de)quantization against the reference C implementation") + parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "ggml" / "src" / "libggml.so", help="The path to libggml.so") + parser.add_argument("--quick", action="store_true", help="Don't quantize with C when it's not strictly necessary") args = parser.parse_args() diff --git a/lpm_kernel/L2/mlx_training/convert_and_serve.sh b/lpm_kernel/L2/mlx_training/convert_and_serve.sh index 68d66fba..ce408a14 100644 --- a/lpm_kernel/L2/mlx_training/convert_and_serve.sh +++ b/lpm_kernel/L2/mlx_training/convert_and_serve.sh @@ -1,4 +1,4 @@ -mlx_lm.fuse --model mlx-community/Qwen2.5-7B-Instruct-4bit \ +mlx_lm.fuse --model mlx-community/Qwen3-8B-4bit \ --adapter-path "resources/model/output/mlx/adapters" \ --save-path "resources/model/output/mlx" diff --git a/lpm_kernel/L2/mlx_training/lora_config.yaml b/lpm_kernel/L2/mlx_training/lora_config.yaml index 0e644e00..57107cd8 100644 --- a/lpm_kernel/L2/mlx_training/lora_config.yaml +++ b/lpm_kernel/L2/mlx_training/lora_config.yaml @@ -1,5 +1,5 @@ # The path to the local model directory or Hugging Face repo. 
-model: "mlx-community/Qwen2.5-7B-Instruct-4bit" +model: "mlx-community/Qwen3-8B-4bit" # Whether or not to train (boolean) train: true diff --git a/lpm_kernel/L2/utils.py b/lpm_kernel/L2/utils.py index 5105c7bc..82638790 100644 --- a/lpm_kernel/L2/utils.py +++ b/lpm_kernel/L2/utils.py @@ -515,7 +515,10 @@ def preprocess(sample, user_name='user', is_cot=False): {"role": "user", "content": user_message}, {"role": "assistant", "content": sample['enhanced_request'].strip('\n')}, ] - return [{"content": tokenizer.apply_chat_template(messages, tokenize=False)}] + if hasattr(tokenizer, 'name_or_path') and 'qwen3' in tokenizer.name_or_path.lower(): + return [{"content": tokenizer.apply_chat_template(messages, tokenize=False, enable_thinking=False)}] + else: + return [{"content": tokenizer.apply_chat_template(messages, tokenize=False)}] if sample.get('assistant') is None and sample.get('user_feedback') is not None: user_message = f"{user_name}'s request is: " + sample['user_request'] + "\n" + "Expert's response is: " + sample['expert_response'] messages = [ @@ -523,7 +526,10 @@ def preprocess(sample, user_name='user', is_cot=False): {"role": "user", "content": user_message}, {"role": "assistant", "content": sample['user_feedback'].strip('\n')}, ] - return [{"content": tokenizer.apply_chat_template(messages, tokenize=False)}] + if hasattr(tokenizer, 'name_or_path') and 'qwen3' in tokenizer.name_or_path.lower(): + return [{"content": tokenizer.apply_chat_template(messages, tokenize=False, enable_thinking=False)}] + else: + return [{"content": tokenizer.apply_chat_template(messages, tokenize=False)}] if sample.get('assistant') is None: return [] @@ -536,7 +542,10 @@ def preprocess(sample, user_name='user', is_cot=False): ] if 'None' in sample['assistant']: return [] - return [{"content": tokenizer.apply_chat_template(messages, tokenize=False)}] + if hasattr(tokenizer, 'name_or_path') and 'qwen3' in tokenizer.name_or_path.lower(): + return [{"content": 
tokenizer.apply_chat_template(messages, tokenize=False, enable_thinking=False)}] + else: + return [{"content": tokenizer.apply_chat_template(messages, tokenize=False)}] dataset = load_dataset("json", data_files=data_args.dataset_name, split="train") res_dataset = [] @@ -620,11 +629,11 @@ def save_hf_model(model_name=None, log_file_path=None) -> str: config = Config() model_name = config.get("training", {}).get("model_name") if not model_name: - logger.warning("No model name provided and none found in config. Using Qwen2.5-0.5B-Instruct as fallback.") - model_name = "Qwen2.5-0.5B-Instruct" + logger.warning("No model name provided and none found in config. Using Qwen3-0.6B as fallback.") + model_name = "Qwen3-0.6B" except Exception as e: - logger.warning(f"Failed to get model name from config: {str(e)}. Using Qwen2.5-0.5B-Instruct as fallback.") - model_name = "Qwen2.5-0.5B-Instruct" + logger.warning(f"Failed to get model name from config: {str(e)}. Using Qwen3-0.6B as fallback.") + model_name = "Qwen3-0.6B" base_dir = os.path.join(os.getcwd(), "resources/L2/base_models") # Normalize model name and check for path traversal attempts diff --git a/lpm_kernel/api/domains/trainprocess/training_params_manager.py b/lpm_kernel/api/domains/trainprocess/training_params_manager.py index c6871542..fc73ab6b 100644 --- a/lpm_kernel/api/domains/trainprocess/training_params_manager.py +++ b/lpm_kernel/api/domains/trainprocess/training_params_manager.py @@ -18,7 +18,7 @@ class TrainingParamsManager: # Default training parameters _default_training_params = { - "model_name": "Qwen2.5-0.5B-Instruct", + "model_name": "Qwen3-0.6B", "learning_rate": 1e-4, "number_of_epochs": 3, "concurrency_threads": 2, diff --git a/lpm_kernel/api/services/local_llm_service.py b/lpm_kernel/api/services/local_llm_service.py index 04abd9a5..31c5b92f 100644 --- a/lpm_kernel/api/services/local_llm_service.py +++ b/lpm_kernel/api/services/local_llm_service.py @@ -113,9 +113,6 @@ def start_server(self, 
model_path: str, use_gpu: bool = True) -> bool: "-m", model_path, "--host", "0.0.0.0", "--port", "8080", - "--ctx-size", "2048", # Default context size (adjust based on needs) - "--parallel", "2", # Enable request parallelism - "--cont-batching" # Enable continuous batching ] # Set up environment with CUDA variables to ensure GPU detection diff --git a/pyproject.toml b/pyproject.toml index 2de8e583..784e0dd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,8 @@ pytest = "7.4.4" ruff = "0.1.15" pandas = "2.2.3" fnllm = {extras = ["azure", "openai"], version = "0.1.2"} -transformers = "4.47.1" -torch = "2.5.1" +transformers = "4.51.3" +torch = "2.7.0" peft = "0.14.0" trl = "0.13.0" gguf = "0.10.0"