From 54ce51272edeb2c45f34cf003cc379451bb3706b Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 18 May 2025 02:53:36 +0300 Subject: [PATCH 1/6] fix: remove prompt completion from cached chat session context window --- src/evaluator/LlamaChatSession/LlamaChatSession.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index f5ff654c..61774c76 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -343,6 +343,7 @@ export class LlamaChatSession { /** @internal */ private readonly _chatLock = {}; /** @internal */ private _chatHistory: ChatHistoryItem[]; /** @internal */ private _lastEvaluation?: LlamaChatResponse["lastEvaluation"]; + /** @internal */ private _canUseLastEvaluationForCompletion: boolean = true; /** @internal */ private _chat: LlamaChat | null; /** @internal */ public _chatHistoryStateRef = {}; /** @internal */ public readonly _preloadAndCompleteAbortControllers = new Set(); @@ -519,7 +520,9 @@ export class LlamaChatSession { const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null; const [abortController, disposeAbortController] = wrapAbortSignal(signal); - let lastEvaluation = this._lastEvaluation; + let lastEvaluation = this._canUseLastEvaluationForCompletion + ? this._lastEvaluation + : undefined; let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt); let newContextWindowChatHistory = lastEvaluation?.contextWindow == null ? 
undefined @@ -723,6 +726,7 @@ export class LlamaChatSession { } this._lastEvaluation = lastEvaluation; + this._canUseLastEvaluationForCompletion = true; this._chatHistory = newChatHistory; this._chatHistoryStateRef = {}; @@ -876,9 +880,10 @@ export class LlamaChatSession { this._lastEvaluation = { cleanHistory: this._chatHistory, - contextWindow: lastEvaluation.contextWindow, + contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow), contextShiftMetadata: lastEvaluation.contextShiftMetadata }; + this._canUseLastEvaluationForCompletion = this._chatHistory.at(-1)?.type === "user"; if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted) throw abortController.signal.reason; From 811095ec65356e6e082a6005ed177884175ca0c6 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 18 May 2025 03:05:59 +0300 Subject: [PATCH 2/6] fix: update recommended models --- src/cli/recommendedModels.ts | 85 ++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/cli/recommendedModels.ts b/src/cli/recommendedModels.ts index efba3526..5be4313c 100644 --- a/src/cli/recommendedModels.ts +++ b/src/cli/recommendedModels.ts @@ -1,6 +1,74 @@ import {ModelRecommendation} from "./utils/resolveModelRecommendationFileOptions.js"; export const recommendedModels: ModelRecommendation[] = [{ + name: "Qwen 3 32B", + abilities: ["chat", "complete", "functionCalling", "reasoning"], + description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" + + "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + + "This model is censored, but its responses quality on many topics is extremely high.\n" + + "This is the 32 billion parameters version of the model.\n" + + "Its performance is comparable and even surpasses DeepSeek R1 and GPT-o1.", + + fileOptions: [ + "hf:Qwen/Qwen3-32B-GGUF:Q8_0", + "hf:Qwen/Qwen3-32B-GGUF:Q6_K", 
+ "hf:Qwen/Qwen3-32B-GGUF:Q5_K_M", + "hf:Qwen/Qwen3-32B-GGUF:Q4_K_M" + ] +}, { + name: "Qwen 3 14B", + abilities: ["chat", "complete", "functionCalling", "reasoning"], + description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" + + "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + + "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" + + "This is the 14 billion parameters version of the model.", + + fileOptions: [ + "hf:Qwen/Qwen3-14B-GGUF:Q8_0", + "hf:Qwen/Qwen3-14B-GGUF:Q6_K", + "hf:Qwen/Qwen3-14B-GGUF:Q5_K_M", + "hf:Qwen/Qwen3-14B-GGUF:Q4_K_M" + ] +}, { + name: "Qwen 3 8B", + abilities: ["chat", "complete", "functionCalling", "reasoning"], + description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" + + "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + + "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" + + "This is the 8 billion parameters version of the model.", + + fileOptions: [ + "hf:Qwen/Qwen3-8B-GGUF:Q8_0", + "hf:Qwen/Qwen3-8B-GGUF:Q6_K", + "hf:Qwen/Qwen3-8B-GGUF:Q5_K_M", + "hf:Qwen/Qwen3-8B-GGUF:Q4_K_M" + ] +}, { + name: "Qwen 3 4B", + abilities: ["chat", "complete", "functionCalling", "reasoning"], + description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" + + "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + + "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" + + "This is the 4 billion parameters version of the model, and is suitable for simpler tasks and can run on lower-end hardware, as well as be very fast on 
higher-end hardware.", + + fileOptions: [ + "hf:Qwen/Qwen3-4B-GGUF:Q8_0", + "hf:Qwen/Qwen3-4B-GGUF:Q6_K", + "hf:Qwen/Qwen3-4B-GGUF:Q5_K_M", + "hf:Qwen/Qwen3-4B-GGUF:Q4_K_M" + ] +}, { + name: "Qwen 3 0.6B", + abilities: ["chat", "complete", "functionCalling", "reasoning"], + description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" + + "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + + "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" + + "This is the 0.6B billion parameters version of the model and is suitable for very simple tasks and can run on very resource-constraint hardware.\n", + + fileOptions: [ + "hf:Qwen/Qwen3-0.6B-GGUF:Q8_0" + ] +}, { name: "DeepSeek R1 Distill Qwen 7B", abilities: ["chat", "complete", "functionCalling", "reasoning"], description: "DeepSeek R1 model was created by DeepSeek and is using chain of though (CoT) to reason across a wide variety of topics.\n" + @@ -75,6 +143,23 @@ export const recommendedModels: ModelRecommendation[] = [{ "hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q5_K_S", "hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M" ] +}, { + name: "Qwen 3 30B A3B MoE", + abilities: ["chat", "complete", "functionCalling", "reasoning"], + description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" + + "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + + "This version of the model utilizes a Mixture of Experts architecture, with only 3B active parameters, thus making it very fast.\n" + + "Mixtures of Experts (MoE) is a technique where different models, each skilled in solving a particular kind of problem, work together to the improve the overall performance on complex tasks.\n" + + "This model is censored, but its responses 
quality on many topics is extremely high.\n" + + "This is the 30 billion parameters Mixtures of Experts (MoE) version of the model.\n" + + "Its performance is comparable and even surpasses DeepSeek V3 and GPT-4o.", + + fileOptions: [ + "hf:Qwen/Qwen3-30B-A3B-GGUF:Q8_0", + "hf:Qwen/Qwen3-30B-A3B-GGUF:Q6_K", + "hf:Qwen/Qwen3-30B-A3B-GGUF:Q5_K_M", + "hf:Qwen/Qwen3-30B-A3B-GGUF:Q4_K_M" + ] }, { name: "QwQ 32B", abilities: ["chat", "complete", "functionCalling", "reasoning"], From 65b5a8d60243d1e9a063b7b91ab22e98df16c3a4 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 18 May 2025 03:06:39 +0300 Subject: [PATCH 3/6] docs: update the awesome list --- docs/guide/awesome.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/guide/awesome.md b/docs/guide/awesome.md index c290928c..708c99a1 100644 --- a/docs/guide/awesome.md +++ b/docs/guide/awesome.md @@ -15,6 +15,9 @@ import DataBadge from "../../.vitepress/components/DataBadge/DataBadge.vue"; * [Manzoni](https://manzoni.app/) ([GitHub](https://github.com/gems-platforms/manzoni-app)) - a text editor running local LLMs
+* [Clippy](https://felixrieseberg.github.io/clippy/) ([GitHub](https://github.com/felixrieseberg/clippy)) - Clippy, resurrected from the 1990s, now with some AI +
+ ## Proprietary * [BashBuddy](https://bashbuddy.run) ([GitHub](https://github.com/wosherco/bashbuddy)) - write bash commands with natural language From 69b4e1bdc22a1c106d50d34a974e4d7984fe38f2 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 18 May 2025 03:17:03 +0300 Subject: [PATCH 4/6] fix: improve variable naming --- src/evaluator/LlamaChatSession/LlamaChatSession.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index 61774c76..cb64518d 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -343,7 +343,7 @@ export class LlamaChatSession { /** @internal */ private readonly _chatLock = {}; /** @internal */ private _chatHistory: ChatHistoryItem[]; /** @internal */ private _lastEvaluation?: LlamaChatResponse["lastEvaluation"]; - /** @internal */ private _canUseLastEvaluationForCompletion: boolean = true; + /** @internal */ private _canUseContextWindowForCompletion: boolean = true; /** @internal */ private _chat: LlamaChat | null; /** @internal */ public _chatHistoryStateRef = {}; /** @internal */ public readonly _preloadAndCompleteAbortControllers = new Set(); @@ -520,7 +520,7 @@ export class LlamaChatSession { const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null; const [abortController, disposeAbortController] = wrapAbortSignal(signal); - let lastEvaluation = this._canUseLastEvaluationForCompletion + let lastEvaluation = this._canUseContextWindowForCompletion ? 
this._lastEvaluation : undefined; let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt); @@ -726,7 +726,7 @@ export class LlamaChatSession { } this._lastEvaluation = lastEvaluation; - this._canUseLastEvaluationForCompletion = true; + this._canUseContextWindowForCompletion = true; this._chatHistory = newChatHistory; this._chatHistoryStateRef = {}; @@ -883,7 +883,7 @@ export class LlamaChatSession { contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow), contextShiftMetadata: lastEvaluation.contextShiftMetadata }; - this._canUseLastEvaluationForCompletion = this._chatHistory.at(-1)?.type === "user"; + this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user"; if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted) throw abortController.signal.reason; @@ -923,6 +923,7 @@ export class LlamaChatSession { this._chatHistory = structuredClone(chatHistory); this._chatHistoryStateRef = {}; this._lastEvaluation = undefined; + this._canUseContextWindowForCompletion = false; } /** Clear the chat history and reset it to the initial state. */ From 4f468a6772a0ef584cb97641f4061154f0ce8474 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 18 May 2025 03:28:08 +0300 Subject: [PATCH 5/6] fix(`getLlamaGpuTypes`): fix edge case --- src/bindings/utils/getLlamaGpuTypes.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/bindings/utils/getLlamaGpuTypes.ts b/src/bindings/utils/getLlamaGpuTypes.ts index 82430132..be67d607 100644 --- a/src/bindings/utils/getLlamaGpuTypes.ts +++ b/src/bindings/utils/getLlamaGpuTypes.ts @@ -15,12 +15,18 @@ import {getPlatform} from "./getPlatform.js"; * as some of them are inadvisable for the current machine (like CUDA on an x64 Mac machine). 
*/ export async function getLlamaGpuTypes(include: "supported" | "allValid"): Promise<LlamaGpuType[]> { - if (include === "supported") - return await getGpuTypesToUseForOption("auto"); - const platform = getPlatform(); const arch = process.arch; + if (include === "supported") { + const gpuTypes = new Set(await getGpuTypesToUseForOption("auto")); + + if (platform === "win" && arch !== "x64") + gpuTypes.delete("vulkan"); // no Vulkan prebuilt binary yet due to incomplete support for arm64 + + return [...gpuTypes]; + } + const res: LlamaGpuType[] = []; // Metal is not properly supported by llama.cpp on x64 Mac machines From 3c07be6811fb26ff5208456e64a6cbdced0578ef Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 19 May 2025 00:47:50 +0300 Subject: [PATCH 6/6] fix: typos --- src/cli/recommendedModels.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli/recommendedModels.ts b/src/cli/recommendedModels.ts index 5be4313c..1609a780 100644 --- a/src/cli/recommendedModels.ts +++ b/src/cli/recommendedModels.ts @@ -62,7 +62,7 @@ export const recommendedModels: ModelRecommendation[] = [{ abilities: ["chat", "complete", "functionCalling", "reasoning"], description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" + "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + - "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" + + "This model is censored, but its responses quality on many topics is very high compared to its small size.\n" + "This is the 0.6B billion parameters version of the model and is suitable for very simple tasks and can run on very resource-constraint hardware.\n", fileOptions: [ @@ -150,7 +150,7 @@ export const recommendedModels: ModelRecommendation[] = [{ "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" + "This version of the
model utilizes a Mixture of Experts architecture, with only 3B active parameters, thus making it very fast.\n" + "Mixtures of Experts (MoE) is a technique where different models, each skilled in solving a particular kind of problem, work together to the improve the overall performance on complex tasks.\n" + - "This model is censored, but its responses quality on many topics is extremely high.\n" + + "This model is censored, but its responses quality on many topics is high compared to its high generation speed.\n" + "This is the 30 billion parameters Mixtures of Experts (MoE) version of the model.\n" + "Its performance is comparable and even surpasses DeepSeek V3 and GPT-4o.",