From 84c7d0711b93e4a3567ef886127e896cbbb83caf Mon Sep 17 00:00:00 2001
From: Howard Zhao <howardzhao06@gmail.com>
Date: Thu, 14 May 2026 15:25:37 +0800
Subject: [PATCH] Fix: wire temperature through to generation in LocalLLM

The temperature parameter of LocalLLM.__init__ was passed to Llama() at load time, but llama-cpp-python applies temperature at sampling time, so the value was silently ignored and the library default was used for every call. This change stores the value as self.temperature in __init__ and uses it in generate() whenever the caller does not pass an explicit temperature. It also passes seed=-1 to Llama() so each load gets fresh randomness, and removes the ineffective temperature kwarg from the Llama(...) call.
---
 shared/llm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/shared/llm.py b/shared/llm.py
index 289d6fb..c1a72e0 100644
--- a/shared/llm.py
+++ b/shared/llm.py
@@ -40,11 +40,12 @@ def __init__(
         """
         self.llm = Llama(
             model_path=model_path,
-            temperature=temperature,
             n_ctx=n_ctx,
             verbose=False,
+            seed=-1,
         )
         self.max_tokens = max_tokens
+        self.temperature = temperature
     
     def generate(self, prompt: str, temperature: float = None, stop: list[str] = None) -> str:
         """
@@ -64,8 +65,7 @@ def generate(self, prompt: str, temperature: float = None, stop: list[str] = Non
             "stop": stop if stop is not None else ["</s>", "\n\n", "User:", "Assistant:"],
         }
         
-        if temperature is not None:
-            kwargs["temperature"] = temperature
+        kwargs["temperature"] = temperature if temperature is not None else self.temperature
         
         response = self.llm(**kwargs)
         return response["choices"][0]["text"].strip()
\ No newline at end of file