libs/inference-server-config.lua

479 lines

-- {{{ inference-server-config.lua
-- Issue 10-049: Inference-server configuration loader (originally written for
-- Ollama under 10-017; renamed and reframed for llama.cpp). Reads server
-- definitions from config.lua and provides an API for server selection. The
-- public surface intentionally stays close to the pre-migration shape so
-- existing call sites in the rest of the codebase keep their structure.
--
-- Usage:
--   local inference = require("inference-server-config")
--   -- Call this before anything else. If it is skipped, the project root is
--   -- guessed from package.path (and, failing that, a hard-coded path).
--   inference.set_project_root("/path/to/project")
--
--   -- Get servers
--   local servers = inference.get_servers()
--   local server = inference.get_server_by_name("gpu-server")
--   local default = inference.get_default_server()
--
--   -- Build URL
--   local url = inference.build_host_url(server) -- "http://192.168.0.115:10265"
--
--   -- Validate connection
--   local ok, msg = inference.validate_server(server)
-- }}}
23
local M = {}

-- {{{ Module state
-- Private, file-scoped state shared by every function in this module.
local project_root     -- directory that holds config.lua; nil until set or detected
local config           -- cached result of loading config.lua; nil = not loaded yet
local selected_server  -- CLI override: server name
local selected_model   -- CLI override: model identifier

-- Interactive-context flag. It starts off, and the only switch is
-- set_interactive_mode(true), which a CLI driver calls after seeing -I on its
-- command line. It is intentionally NOT a config.lua key: the user-editable
-- config describes what the project IS, not how the operator happens to be
-- running it today.
--
-- Policy applied whenever user input fails to resolve against the configured
-- options (a typo in --server, an unrecognized --model, a default that names
-- a nonexistent entry, ...):
--   * non-interactive callers hard-error immediately, so the mistake cannot
--     be missed;
--   * interactive callers are asked to pick between a sensible default and
--     aborting.
-- Silently falling back to a default is never acceptable here. Warnings get
-- scrolled past in long log streams, and a wrong default can burn hours of
-- work against the wrong endpoint before anyone notices.
local interactive_mode = false
-- }}}
49
-- {{{ set_project_root
-- Tell the module which directory contains config.lua.
-- Changing the root also discards the cached config, so the next lookup
-- re-reads config.lua from the new location.
function M.set_project_root(path)
    config = nil -- the cached table belonged to the previous root
    project_root = path
end
-- }}}
57
-- {{{ local function load_config
-- Return the table produced by <project_root>/config.lua, loading it on first
-- use and caching it in `config` for later calls.
--
-- Root resolution when set_project_root() was never called:
--   1. the directory that precedes "/libs/?.lua" in package.path, if present;
--   2. otherwise a hard-coded, machine-specific development path.
--
-- Failure handling:
--   * config.lua cannot be opened          -> best-effort: an empty table is
--     cached and returned.
--   * config.lua exists but fails to load  -> error carrying the loader's own
--     message. A syntax error in the config must not be mistaken for "no
--     config", because that would silently swap in the built-in fallback
--     server list.
--   * config.lua runs but returns a non-table -> error. Nothing is cached, so
--     a corrected file is picked up on the next call.
--
-- Returns: the config table (possibly empty).
-- Raises:  on an unloadable or malformed config.lua, as described above.
local function load_config()
    if config then
        return config
    end

    if not project_root then
        -- Detect the root from package.path; fall back to the development
        -- default when no "<root>/libs/?.lua" entry is present.
        project_root = package.path:match("([^;]+)/libs/%?%.lua")
            or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
    end

    local config_path = project_root .. "/config.lua"
    local ok, result = pcall(dofile, config_path)

    if not ok then
        -- Tell "file is absent/unreadable" apart from "file is broken".
        local probe = io.open(config_path, "r")
        if not probe then
            -- Config not available, use empty
            config = {}
            return config
        end
        probe:close()
        error(string.format(
            "inference-server-config: failed to load %s: %s",
            config_path, tostring(result)))
    end

    if type(result) ~= "table" then
        error(string.format(
            "inference-server-config: %s must return a table (got %s). "
            .. "Make sure the file ends with a 'return { ... }' statement.",
            config_path, type(result)))
    end

    config = result
    return config
end
-- }}}
88
-- {{{ local function prompt_for_server_fallback
-- Interactive recovery for "--server=<name> did not resolve."
--
-- Shows the configured default server on stderr and asks the operator whether
-- to use it (1) or abort (2). Prompts go to stderr so callers that capture
-- stdout still surface them (though such callers should not enable
-- interactive mode in the first place).
--
-- Parameters:
--   bad_name  the --server value that failed to resolve (used in messages).
-- Returns:
--   the default server table when the operator answers "1".
-- Raises:
--   * when no usable default_inference_server exists to fall back to;
--   * when the operator answers anything other than "1" (including EOF).
-- Side effect:
--   on "1", selected_server is cleared so later get_selected_server() calls
--   go straight to the default instead of prompting again.
local function prompt_for_server_fallback(bad_name)
    -- get_server_by_name(nil) yields nil, so an unset default needs no
    -- special casing here.
    local default_server = M.get_server_by_name(load_config().default_inference_server)

    io.stderr:write(string.format(
        "\n[!] Inference server '%s' was not found in config.lua's inference_servers.\n", bad_name))

    if not default_server then
        io.stderr:write(" No usable default_inference_server is configured to fall back to.\n")
        error(string.format(
            "inference-server-config: --server=%s did not resolve and no default is available.", bad_name))
    end

    io.stderr:write("\nThe configured default is:\n")
    io.stderr:write(string.format(" name: %s\n", default_server.name))
    io.stderr:write(string.format(" host: %s\n", default_server.host or "(missing in config!)"))
    io.stderr:write(string.format(" port: %s\n", tostring(default_server.port or "(missing in config!)")))
    io.stderr:write(string.format(" model: %s\n", default_server.model or "(missing in config!)"))
    io.stderr:write("\n 1) Use the default\n")
    io.stderr:write(" 2) Error and exit\n")
    io.stderr:write("\nSelect 1 or 2: ")

    -- io.read returns nil at EOF. Trim the reply so "1 " or "1\r" (CRLF
    -- terminals) is not misread as a request to abort.
    local reply = io.read("*l")
    local choice = reply and reply:match("^%s*(.-)%s*$")
    if choice == "1" then
        selected_server = nil -- so subsequent calls go straight to the default without re-prompting
        return default_server
    end

    error(string.format(
        "inference-server-config: aborted by user — '%s' did not resolve and user chose to exit.", bad_name))
end
-- }}}
130
-- {{{ function M.set_interactive_mode
-- Turn interactive mode on or off. Intended for the CLI driver only, once it
-- has confirmed that -I was given on the command line.
-- Any truthy argument enables the mode; nil/false disables it. The stored
-- flag is always a real boolean.
function M.set_interactive_mode(enabled)
    interactive_mode = not not enabled
end
-- }}}
139
-- {{{ get_servers
-- List every inference server defined in config.lua.
-- Returns the configured inference_servers array as-is when it has at least
-- one entry; otherwise returns a freshly built one-element array holding the
-- built-in fallback server.
function M.get_servers()
    local configured = load_config().inference_servers

    if not configured or #configured < 1 then
        -- Nothing configured. The fallback host:port matches the operator's
        -- LAN-accessible llama.cpp box (192.168.1.100:10265) so a bare-config
        -- run still resolves to the right endpoint.
        local fallback = {
            name = "local",
            description = "Local llama.cpp instance (fallback)",
            host = "192.168.1.100",
            port = 10265,
            model = "nomic-embed-text-v1.5"
        }
        return { fallback }
    end

    return configured
end
-- }}}
165
-- {{{ get_server_by_name
-- Look up a server entry by its `name` field.
-- Returns the first entry from get_servers() whose name equals `name`, or nil
-- when `name` is nil/false or nothing matches.
function M.get_server_by_name(name)
    local found = nil

    if name then
        for _, candidate in ipairs(M.get_servers()) do
            if candidate.name == name then
                found = candidate
                break
            end
        end
    end

    return found
end
-- }}}
181
-- {{{ get_default_server
-- Return the server named by config.lua's default_inference_server.
--
-- This is the "must work" path for callers that need an endpoint to issue a
-- request. It raises when default_inference_server is unset, and also when
-- it names a server that does not exist in inference_servers.
--
-- There used to be a quiet fallback to servers[1]. It was removed because it
-- hid config drift: renaming default_inference_server without updating its
-- referent made every consumer in the pipeline talk to whichever server
-- happened to be listed first, producing wrong results with no error. With
-- the loud failure, a successful resolution means "config said to use this
-- server", never "we guessed".
function M.get_default_server()
    local default_name = load_config().default_inference_server

    if not default_name then
        error("inference-server-config: config.lua does not set default_inference_server. "
            .. "Set it to one of the names in inference_servers, or pass --server=<name> on the CLI.")
    end

    local server = M.get_server_by_name(default_name)
    if server then
        return server
    end

    error(string.format(
        "inference-server-config: default_inference_server is '%s' but no entry with that name exists in inference_servers. "
        .. "Fix the name in config.lua, or add a matching inference_servers entry.",
        default_name))
end
-- }}}
215
-- {{{ get_selected_server
-- Work out which server the current run should talk to.
--
-- Resolution order:
--   1. No --server override recorded (set_selected_server never called, or
--      called with nil): defer to get_default_server, which returns the
--      configured default or raises if it is missing/unresolvable.
--   2. An override is recorded: look it up in inference_servers.
--        - found: return that server;
--        - not found, interactive mode: ask the operator whether to use the
--          default or abort;
--        - not found, non-interactive: raise an error that names the bad
--          --server=<name> and explains how to fix it.
--
-- There is deliberately no silent fallback. A mistyped --server used to log a
-- stderr warning and carry on with the default, so a busy operator could miss
-- the warning and spend hours of pipeline time against the wrong endpoint.
function M.get_selected_server()
    if not selected_server then
        return M.get_default_server()
    end

    local server = M.get_server_by_name(selected_server)
    if server then
        return server
    end

    if interactive_mode then
        return prompt_for_server_fallback(selected_server)
    end

    error(string.format(
        "inference-server-config: --server=%s does not match any entry in inference_servers (config.lua).\n"
        .. "Fix the name on the CLI, add a matching entry to inference_servers, "
        .. "or pass -I to enable interactive selection.",
        selected_server))
end
-- }}}
256
-- {{{ set_selected_server
-- Record the server name given on the CLI (--server=<name>).
-- The name is stored without validation; it is resolved against
-- inference_servers later, by get_selected_server. Passing nil removes the
-- override.
function M.set_selected_server(name)
    selected_server = name
end
-- }}}
263
-- {{{ get_selected_model
-- Decide which model identifier to send to the inference server.
--
-- --model=<name> is NOT checked against any local list. The inference server
-- is the authority on which models are loaded; config.lua can only guess. An
-- unknown model therefore surfaces as the server's own "model not found"
-- error and the pipeline stops there. Having two layers that both claim to
-- know which models exist invites drift: the config lists models that are
-- gone, or omits ones that are installed.
--
-- The available_models field on an inference_servers entry remains useful as
-- operator documentation (and in list_servers output); it is simply not used
-- as a gate here.
--
-- Resolution order:
--   1. Resolve the server via get_selected_server (which raises or prompts
--      when --server=<name> does not resolve). This happens first even when
--      a --model override exists.
--   2. A model recorded through set_selected_model is returned verbatim.
--   3. Otherwise the "model" key of the runtime-overrides module is used, if
--      set.
--   4. Otherwise server.model is returned; if that field is absent the call
--      raises, because config.lua is the source of truth for the per-host
--      default model.
function M.get_selected_model()
    local server = M.get_selected_server()

    if selected_model then
        return selected_model
    end

    -- Model propagation. Per the original author's note, a --model passed to
    -- run.sh is written once, at startup, to the shared per-run notepad
    -- (tmp/run-overrides.lua). Reading it here lets every short-lived child
    -- process (the HTML, word-cloud and word-page stages that call this, or
    -- embeddings_dir() with no argument) resolve the SAME model the embedding
    -- stage used, rather than quietly reverting to server.model. A missing
    -- notepad or key yields nil, so a plain run (no --model) still falls
    -- through to config.lua. project_root has already been resolved by the
    -- get_selected_server call above.
    local overrides = require("runtime-overrides")
    overrides.set_project_root(project_root)
    local from_notepad = overrides.get("model")
    if from_notepad then
        return from_notepad
    end

    if server.model then
        return server.model
    end

    error(string.format(
        "inference-server-config: server '%s' has no 'model' field in config.lua's inference_servers entry. "
        .. "Add a model = \"<name>\" field to that entry, or pass --model=<name> on the CLI.",
        server.name))
end
-- }}}
318
-- {{{ set_selected_model
-- Record the model identifier given on the CLI (--model=<name>).
-- The value is stored as-is and later returned verbatim by
-- get_selected_model. Passing nil removes the override.
function M.set_selected_model(model)
    selected_model = model
end
-- }}}
325
-- {{{ build_host_url
-- Compose the base URL ("http://<host>:<port>") for a server entry.
--
-- Parameters:
--   server  server table; when omitted, the currently selected server is used.
-- Returns:
--   a URL string such as "http://192.168.0.115:10265".
-- Raises:
--   when the entry lacks `host` or `port`. A server entry without them is a
--   config bug, not an invitation to guess. Earlier code substituted
--   "localhost" and 11434, so a forgotten host = line quietly redirected
--   every embedding request to nothing.
function M.build_host_url(server)
    server = server or M.get_selected_server()

    local label = server.name or "(unnamed server)"
    local host, port = server.host, server.port

    if not host then
        error(string.format(
            "inference-server-config: server '%s' has no 'host' field in config.lua. "
            .. "Add a host = \"<hostname-or-ip>\" field to that inference_servers entry.", label))
    end

    if not port then
        error(string.format(
            "inference-server-config: server '%s' has no 'port' field in config.lua. "
            .. "Add a port = <number> field to that inference_servers entry.", label))
    end

    return ("http://%s:%d"):format(host, port)
end
-- }}}
353
-- {{{ validate_server
-- Probe a server with curl and report whether it answers.
--
-- Parameters:
--   server  server table; when omitted, the currently selected server is used.
-- Returns:
--   success (bool), message (string)
--     true,  "Server is reachable"             on HTTP 200
--     false, <reason>                          otherwise
-- Raises:
--   only what build_host_url / get_selected_server raise for a bad config.
--   Failures of the probe itself are reported through the return values.
function M.validate_server(server)
    if not server then
        server = M.get_selected_server()
    end

    -- /v1/models is llama.cpp's OpenAI-compatible "what's loaded" endpoint.
    -- Was /api/tags under Ollama; migrated in 10-049 along with the rest
    -- of the API surface.
    local url = M.build_host_url(server) .. "/v1/models"

    -- Single-quote the URL for the shell, escaping embedded single quotes so
    -- an odd host value in config.lua cannot break out of the argument.
    local quoted_url = "'" .. (url:gsub("'", "'\\''")) .. "'"
    local cmd = string.format(
        "curl -s -o /dev/null -w '%%{http_code}' --max-time 3 %s 2>/dev/null", quoted_url)

    -- io.popen returns nil when the process cannot be started; keep the
    -- (bool, message) contract instead of crashing on a nil handle.
    local handle = io.popen(cmd)
    if not handle then
        return false, "Could not start curl to probe the server"
    end

    local status = handle:read("*a") or ""
    handle:close()

    status = status:gsub("%s+", "") -- Trim whitespace

    if status == "200" then
        return true, "Server is reachable"
    elseif status == "000" then
        -- curl prints 000 when it received no HTTP response at all.
        return false, "Connection timeout - server unreachable"
    elseif status == "" then
        -- No output: curl itself did not run (its stderr is suppressed above).
        return false, "No HTTP status received - is curl installed?"
    else
        return false, "Server returned HTTP " .. status
    end
end
-- }}}
383
-- {{{ list_servers
-- Print a human-readable list of the available servers to stdout.
--
-- This is a diagnostic, not a request-issuing path: it has to keep working
-- while the operator is inspecting a broken config. Therefore it
--   * reads default_inference_server directly instead of calling
--     get_default_server, so --list-servers works with no (or a wrong)
--     default;
--   * shows a placeholder instead of raising when an entry lacks host/port
--     (build_host_url would raise), a name, or a model;
--   * never invents a model name for an entry that has none, since
--     get_selected_model would hard-error for that entry.
function M.list_servers()
    local cfg = load_config()
    local servers = M.get_servers()
    local default_name = cfg.default_inference_server -- may be nil; that's fine here

    print("Available Inference servers:")
    print(string.rep("-", 70))

    for _, server in ipairs(servers) do
        local is_default = (default_name ~= nil and server.name == default_name)
        local default_marker = is_default and " (default)" or ""

        -- build_host_url raises on incomplete entries; show them, don't die.
        local ok, url = pcall(M.build_host_url, server)
        if not ok then
            url = "(unavailable: host/port missing or invalid in config!)"
        end

        print(string.format(" %s%s", tostring(server.name or "(unnamed server)"), default_marker))
        print(string.format(" %s", tostring(server.description or "")))
        print(string.format(" URL: %s", url))
        print(string.format(" Model: %s", tostring(server.model or "(missing in config!)")))

        local available = server.available_models
        if type(available) == "table" and #available > 0 then
            -- available_models entries may be plain strings or {model=...} tables
            -- (a table also carries its own GGUF + prompt); show just the names.
            local names = {}
            for _, entry in ipairs(available) do
                local label = entry
                if type(entry) == "table" then
                    -- A table entry without a model field must not reach
                    -- table.concat as a table.
                    label = entry.model or "(unnamed model)"
                end
                names[#names + 1] = tostring(label)
            end
            print(string.format(" Available models: %s", table.concat(names, ", ")))
        end
        print("")
    end
end
-- }}}
422
-- {{{ format_embedding_prompt
-- Prepend the active model's embedding_prompt_prefix to `text`.
--
-- Certain embedding models (notably nomic-embed-text v1.5+) need a task
-- prefix on every input ("clustering: ", "search_query: ",
-- "search_document: ", ...) that steers the model through different internal
-- weights. Models with no such requirement (embeddinggemma, qwen3-embedding)
-- leave the field nil, and `text` is returned unchanged.
--
-- Keeping the prefix logic here makes a model swap a single config edit even
-- when prefix requirements differ; callers never need to know which model is
-- active.
function M.format_embedding_prompt(text)
    local model_cfg = M.get_selected_model_config()
    local prefix = model_cfg and model_cfg.embedding_prompt_prefix

    -- nil, false and the empty string all mean "no prefix".
    if not prefix or prefix == "" then
        return text
    end

    return prefix .. text
end
-- }}}
444
-- {{{ get_selected_model_config
-- Build { model, model_path, embedding_prompt_prefix } for the SELECTED model
-- on the selected server. This lets a single server entry serve several local
-- GGUFs: an available_models entry may be a table with its own model_path and
-- prompt prefix, so `--server local --model X` loads X with X's phrasing.
--
-- Resolution:
--   * Start from the server's top-level model_path / embedding_prompt_prefix
--     (the default model's settings).
--   * If the selected model equals the `model` of a TABLE entry in
--     available_models, the first such entry overrides them: model_path falls
--     back to the server's when the entry has none, whereas the prefix is
--     taken from the entry alone (nil if the entry does not set one).
--   * Plain-string entries, and servers whose available_models is
--     documentation-only (like the remote gpu-server), change nothing, so
--     the common no --model case is unaffected.
function M.get_selected_model_config()
    local server = M.get_selected_server()
    local model = M.get_selected_model()

    local model_path = server.model_path
    local prefix = server.embedding_prompt_prefix

    for _, entry in ipairs(server.available_models or {}) do
        if type(entry) == "table" and entry.model == model then
            model_path = entry.model_path or server.model_path
            prefix = entry.embedding_prompt_prefix
            break
        end
    end

    return {
        model = model,
        model_path = model_path,
        embedding_prompt_prefix = prefix,
    }
end
-- }}}
477
478return M
479