From 54ce51272edeb2c45f34cf003cc379451bb3706b Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 18 May 2025 02:53:36 +0300
Subject: [PATCH 1/6] fix: remove prompt completion from cached chat session
 context window

---
 src/evaluator/LlamaChatSession/LlamaChatSession.ts | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
index f5ff654c..61774c76 100644
--- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts
+++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
@@ -343,6 +343,7 @@ export class LlamaChatSession {
     /** @internal */ private readonly _chatLock = {};
     /** @internal */ private _chatHistory: ChatHistoryItem[];
     /** @internal */ private _lastEvaluation?: LlamaChatResponse["lastEvaluation"];
+    /** @internal */ private _canUseLastEvaluationForCompletion: boolean = true;
     /** @internal */ private _chat: LlamaChat | null;
     /** @internal */ public _chatHistoryStateRef = {};
     /** @internal */ public readonly _preloadAndCompleteAbortControllers = new Set<AbortController>();
@@ -519,7 +520,9 @@ export class LlamaChatSession {
 
             const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
             const [abortController, disposeAbortController] = wrapAbortSignal(signal);
-            let lastEvaluation = this._lastEvaluation;
+            let lastEvaluation = this._canUseLastEvaluationForCompletion
+                ? this._lastEvaluation
+                : undefined;
             let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
             let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
                 ? undefined
@@ -723,6 +726,7 @@ export class LlamaChatSession {
                     }
 
                     this._lastEvaluation = lastEvaluation;
+                    this._canUseLastEvaluationForCompletion = true;
                     this._chatHistory = newChatHistory;
                     this._chatHistoryStateRef = {};
 
@@ -876,9 +880,10 @@ export class LlamaChatSession {
 
                 this._lastEvaluation = {
                     cleanHistory: this._chatHistory,
-                    contextWindow: lastEvaluation.contextWindow,
+                    contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow),
                     contextShiftMetadata: lastEvaluation.contextShiftMetadata
                 };
+                this._canUseLastEvaluationForCompletion = this._chatHistory.at(-1)?.type === "user";
 
                 if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
                     throw abortController.signal.reason;

From 811095ec65356e6e082a6005ed177884175ca0c6 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 18 May 2025 03:05:59 +0300
Subject: [PATCH 2/6] fix: update recommended models

---
 src/cli/recommendedModels.ts | 85 ++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/src/cli/recommendedModels.ts b/src/cli/recommendedModels.ts
index efba3526..5be4313c 100644
--- a/src/cli/recommendedModels.ts
+++ b/src/cli/recommendedModels.ts
@@ -1,6 +1,74 @@
 import {ModelRecommendation} from "./utils/resolveModelRecommendationFileOptions.js";
 
 export const recommendedModels: ModelRecommendation[] = [{
+    name: "Qwen 3 32B",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "Qwen model was created by Alibaba and is using chain of thought (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
+        "This model is censored, but its responses quality on many topics is extremely high.\n" +
+        "This is the 32 billion parameters version of the model.\n" +
+        "Its performance is comparable and even surpasses DeepSeek R1 and GPT-o1.",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-32B-GGUF:Q8_0",
+        "hf:Qwen/Qwen3-32B-GGUF:Q6_K",
+        "hf:Qwen/Qwen3-32B-GGUF:Q5_K_M",
+        "hf:Qwen/Qwen3-32B-GGUF:Q4_K_M"
+    ]
+}, {
+    name: "Qwen 3 14B",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "Qwen model was created by Alibaba and is using chain of thought (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
+        "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
+        "This is the 14 billion parameters version of the model.",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-14B-GGUF:Q8_0",
+        "hf:Qwen/Qwen3-14B-GGUF:Q6_K",
+        "hf:Qwen/Qwen3-14B-GGUF:Q5_K_M",
+        "hf:Qwen/Qwen3-14B-GGUF:Q4_K_M"
+    ]
+}, {
+    name: "Qwen 3 8B",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "Qwen model was created by Alibaba and is using chain of thought (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
+        "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
+        "This is the 8 billion parameters version of the model.",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-8B-GGUF:Q8_0",
+        "hf:Qwen/Qwen3-8B-GGUF:Q6_K",
+        "hf:Qwen/Qwen3-8B-GGUF:Q5_K_M",
+        "hf:Qwen/Qwen3-8B-GGUF:Q4_K_M"
+    ]
+}, {
+    name: "Qwen 3 4B",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "Qwen model was created by Alibaba and is using chain of thought (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
+        "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
+        "This is the 4 billion parameters version of the model, and is suitable for simpler tasks and can run on lower-end hardware, as well as be very fast on higher-end hardware.",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-4B-GGUF:Q8_0",
+        "hf:Qwen/Qwen3-4B-GGUF:Q6_K",
+        "hf:Qwen/Qwen3-4B-GGUF:Q5_K_M",
+        "hf:Qwen/Qwen3-4B-GGUF:Q4_K_M"
+    ]
+}, {
+    name: "Qwen 3 0.6B",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
+        "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
+        "This is the 0.6B billion parameters version of the model and is suitable for very simple tasks and can run on very resource-constraint hardware.\n",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-0.6B-GGUF:Q8_0"
+    ]
+}, {
     name: "DeepSeek R1 Distill Qwen 7B",
     abilities: ["chat", "complete", "functionCalling", "reasoning"],
     description: "DeepSeek R1 model was created by DeepSeek and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
@@ -75,6 +143,23 @@ export const recommendedModels: ModelRecommendation[] = [{
         "hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q5_K_S",
         "hf:mradermacher/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M"
     ]
+}, {
+    name: "Qwen 3 30B A3B MoE",
+    abilities: ["chat", "complete", "functionCalling", "reasoning"],
+    description: "Qwen model was created by Alibaba and is using chain of thought (CoT) to reason across a wide variety of topics.\n" +
+        "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
+        "This version of the model utilizes a Mixture of Experts architecture, with only 3B active parameters, thus making it very fast.\n" +
+        "Mixtures of Experts (MoE) is a technique where different models, each skilled in solving a particular kind of problem, work together to the improve the overall performance on complex tasks.\n" +
+        "This model is censored, but its responses quality on many topics is extremely high.\n" +
+        "This is the 30 billion parameters Mixtures of Experts (MoE) version of the model.\n" +
+        "Its performance is comparable and even surpasses DeepSeek V3 and GPT-4o.",
+
+    fileOptions: [
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q8_0",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q6_K",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q5_K_M",
+        "hf:Qwen/Qwen3-30B-A3B-GGUF:Q4_K_M"
+    ]
 }, {
     name: "QwQ 32B",
     abilities: ["chat", "complete", "functionCalling", "reasoning"],

From 65b5a8d60243d1e9a063b7b91ab22e98df16c3a4 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 18 May 2025 03:06:39 +0300
Subject: [PATCH 3/6] docs: update the awesome list

---
 docs/guide/awesome.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/guide/awesome.md b/docs/guide/awesome.md
index c290928c..708c99a1 100644
--- a/docs/guide/awesome.md
+++ b/docs/guide/awesome.md
@@ -15,6 +15,9 @@ import DataBadge from "../../.vitepress/components/DataBadge/DataBadge.vue";
 * [Manzoni](https://manzoni.app/) ([GitHub](https://github.com/gems-platforms/manzoni-app)) - a text editor running local LLMs
   <br /><DataBadge title="License" content="AGPL-3.0"/>
 
+* [Clippy](https://felixrieseberg.github.io/clippy/) ([GitHub](https://github.com/felixrieseberg/clippy)) - Clippy, resurrected from the 1990s, now with some AI
+  <br /><DataBadge title="License" content="MIT"/>
+
 
 ## Proprietary
 * [BashBuddy](https://bashbuddy.run) ([GitHub](https://github.com/wosherco/bashbuddy)) - write bash commands with natural language

From 69b4e1bdc22a1c106d50d34a974e4d7984fe38f2 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 18 May 2025 03:17:03 +0300
Subject: [PATCH 4/6] fix: improve variable naming

---
 src/evaluator/LlamaChatSession/LlamaChatSession.ts | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
index 61774c76..cb64518d 100644
--- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts
+++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
@@ -343,7 +343,7 @@ export class LlamaChatSession {
     /** @internal */ private readonly _chatLock = {};
     /** @internal */ private _chatHistory: ChatHistoryItem[];
     /** @internal */ private _lastEvaluation?: LlamaChatResponse["lastEvaluation"];
-    /** @internal */ private _canUseLastEvaluationForCompletion: boolean = true;
+    /** @internal */ private _canUseContextWindowForCompletion: boolean = true;
     /** @internal */ private _chat: LlamaChat | null;
     /** @internal */ public _chatHistoryStateRef = {};
     /** @internal */ public readonly _preloadAndCompleteAbortControllers = new Set<AbortController>();
@@ -520,7 +520,7 @@ export class LlamaChatSession {
 
             const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
             const [abortController, disposeAbortController] = wrapAbortSignal(signal);
-            let lastEvaluation = this._canUseLastEvaluationForCompletion
+            let lastEvaluation = this._canUseContextWindowForCompletion
                 ? this._lastEvaluation
                 : undefined;
             let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
@@ -726,7 +726,7 @@ export class LlamaChatSession {
                     }
 
                     this._lastEvaluation = lastEvaluation;
-                    this._canUseLastEvaluationForCompletion = true;
+                    this._canUseContextWindowForCompletion = true;
                     this._chatHistory = newChatHistory;
                     this._chatHistoryStateRef = {};
 
@@ -883,7 +883,7 @@ export class LlamaChatSession {
                     contextWindow: asWithLastUserMessageRemoved(lastEvaluation.contextWindow),
                     contextShiftMetadata: lastEvaluation.contextShiftMetadata
                 };
-                this._canUseLastEvaluationForCompletion = this._chatHistory.at(-1)?.type === "user";
+                this._canUseContextWindowForCompletion = this._chatHistory.at(-1)?.type === "user";
 
                 if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
                     throw abortController.signal.reason;
@@ -923,6 +923,7 @@ export class LlamaChatSession {
         this._chatHistory = structuredClone(chatHistory);
         this._chatHistoryStateRef = {};
         this._lastEvaluation = undefined;
+        this._canUseContextWindowForCompletion = false;
     }
 
     /** Clear the chat history and reset it to the initial state. */

From 4f468a6772a0ef584cb97641f4061154f0ce8474 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 18 May 2025 03:28:08 +0300
Subject: [PATCH 5/6] fix(`getLlamaGpuTypes`): fix edge case

---
 src/bindings/utils/getLlamaGpuTypes.ts | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/bindings/utils/getLlamaGpuTypes.ts b/src/bindings/utils/getLlamaGpuTypes.ts
index 82430132..be67d607 100644
--- a/src/bindings/utils/getLlamaGpuTypes.ts
+++ b/src/bindings/utils/getLlamaGpuTypes.ts
@@ -15,12 +15,18 @@ import {getPlatform} from "./getPlatform.js";
  * as some of them are inadvisable for the current machine (like CUDA on an x64 Mac machine).
  */
 export async function getLlamaGpuTypes(include: "supported" | "allValid"): Promise<LlamaGpuType[]> {
-    if (include === "supported")
-        return await getGpuTypesToUseForOption("auto");
-
     const platform = getPlatform();
     const arch = process.arch;
 
+    if (include === "supported") {
+        const gpuTypes = new Set(await getGpuTypesToUseForOption("auto"));
+
+        if (platform === "win" && arch !== "x64")
+            gpuTypes.delete("vulkan"); // no prebuilt Vulkan binary for non-x64 Windows yet due to incomplete arm64 support
+
+        return [...gpuTypes];
+    }
+
     const res: LlamaGpuType[] = [];
 
     // Metal is not properly supported by llama.cpp on x64 Mac machines

From 3c07be6811fb26ff5208456e64a6cbdced0578ef Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 19 May 2025 00:47:50 +0300
Subject: [PATCH 6/6] fix: typos

---
 src/cli/recommendedModels.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cli/recommendedModels.ts b/src/cli/recommendedModels.ts
index 5be4313c..1609a780 100644
--- a/src/cli/recommendedModels.ts
+++ b/src/cli/recommendedModels.ts
@@ -62,7 +62,7 @@ export const recommendedModels: ModelRecommendation[] = [{
     abilities: ["chat", "complete", "functionCalling", "reasoning"],
     description: "Qwen model was created by Alibaba and is using chain of though (CoT) to reason across a wide variety of topics.\n" +
         "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
-        "This model is censored, but its responses quality on many topics is extremely high compared to its size.\n" +
+        "This model is censored, but its responses quality on many topics is very high compared to its small size.\n" +
         "This is the 0.6B billion parameters version of the model and is suitable for very simple tasks and can run on very resource-constraint hardware.\n",
 
     fileOptions: [
@@ -150,7 +150,7 @@ export const recommendedModels: ModelRecommendation[] = [{
         "It's optimized for an assistant-like chat use cases, with native support for function calling.\n" +
         "This version of the model utilizes a Mixture of Experts architecture, with only 3B active parameters, thus making it very fast.\n" +
         "Mixtures of Experts (MoE) is a technique where different models, each skilled in solving a particular kind of problem, work together to the improve the overall performance on complex tasks.\n" +
-        "This model is censored, but its responses quality on many topics is extremely high.\n" +
+        "This model is censored, but its responses quality on many topics is high compared to its high generation speed.\n" +
         "This is the 30 billion parameters Mixtures of Experts (MoE) version of the model.\n" +
         "Its performance is comparable and even surpasses DeepSeek V3 and GPT-4o.",