src/embedding-server-manager.lua

172 lines

1#!/usr/bin/env lua
2
3-- {{{ embedding-server-manager.lua
4-- Issue 10-049: Embedding-server manager. Originally written for Ollama
5-- (10-005); reimplemented for llama.cpp. Provides liveness checks against
6-- the configured inference server, can launch start-llamacpp-server.sh
7-- when the server is down, and runs an end-to-end test embedding request.
8--
9-- Public API (preserved across the migration so call sites do not need
10-- to change shape):
11-- M.ensure_ready() — return the endpoint URL once the server is up
12-- M.test_embedding(ep, model) — send one /v1/embeddings request, verify shape
13--
14-- The big shape changes from the Ollama era:
15-- - /api/tags → /v1/models (OpenAI-compatible liveness)
16-- - /api/embeddings → /v1/embeddings (OpenAI-compatible inference)
17-- - "prompt" body → "input" body (OpenAI request shape)
18-- - response.embedding → response.data[1].embedding (OpenAI response shape)
19-- - "ollama pull" → (removed) (llama.cpp has the model on disk
20-- already; nothing to pull)
21-- - "ollama serve" → delegate to scripts/start-llamacpp-server.sh
22-- }}}
23
-- {{{ local function setup_dir_path
-- Resolves the project root directory that libs/ and scripts/ are looked up
-- under.
--
-- @param provided_dir  optional directory, normally arg[1] from the command line
-- @return provided_dir when it looks like a directory; otherwise the built-in
--         default checkout location
--
-- When the script is run without an explicit directory, a command-line flag
-- such as "-I" ends up in arg[1]. A value that is empty or starts with "-"
-- is therefore treated as "no directory given" instead of being used as a
-- path prefix.
local function setup_dir_path(provided_dir)
    local default_dir = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"

    if not provided_dir then
        return default_dir
    end

    if type(provided_dir) == "string"
        and (provided_dir == "" or provided_dir:sub(1, 1) == "-") then
        return default_dir
    end

    return provided_dir
end
-- }}}
32
33local DIR = setup_dir_path(arg and arg[1])
34package.path = DIR .. "/libs/?.lua;" .. package.path
35local inference_config = require("inference-server-config")
36
37local M = {}
38
-- {{{ local function is_server_running
-- Liveness probe: requests <endpoint>/v1/models through curl with a
-- 2-second timeout and reports whether curl succeeded. The response body is
-- discarded; only curl's exit status is inspected.
--
-- @param endpoint  base URL of the inference server (string, no trailing slash)
-- @return true when curl exits successfully, false otherwise
--
-- The URL is single-quoted for the shell so characters such as spaces, ";",
-- "&" or "#" in the configured endpoint cannot split or extend the command.
-- --fail makes curl exit non-zero on an HTTP error status, so an error
-- response from whatever is listening on the port is not reported as a
-- healthy server.
local function is_server_running(endpoint)
    local url = endpoint .. "/v1/models"

    -- POSIX single-quote escaping: close the quote, emit \', reopen the quote.
    local quoted_url = "'" .. (url:gsub("'", "'\\''")) .. "'"

    local cmd = "curl -s --fail --max-time 2 " .. quoted_url .. " > /dev/null 2>&1"
    local status = os.execute(cmd)

    -- Lua 5.1 returns the numeric exit status; 5.2+ returns true on success.
    return status == 0 or status == true
end
-- }}}
50
-- {{{ local function start_llamacpp_service
-- Runs <DIR>/scripts/start-llamacpp-server.sh with the project directory as
-- its only argument and reports whether the script exited successfully.
--
-- The script is expected to take care of its own environment setup and to
-- wait until the server answers before returning (its contents are not
-- visible from this file), which is why no extra wait loop lives here.
--
-- @return true when the script's exit status signals success, false otherwise
local function start_llamacpp_service()
    print("Starting llama.cpp embedding server via scripts/start-llamacpp-server.sh...")

    local launcher = string.format('"%s/scripts/start-llamacpp-server.sh" "%s"', DIR, DIR)
    local status = os.execute(launcher)

    -- os.execute yields 0 on Lua 5.1 and true on 5.2+ for a successful run.
    if status == true or status == 0 then
        return true
    end
    return false
end
-- }}}
63
-- {{{ function M.ensure_ready
-- Makes sure the configured inference server is reachable.
--
-- Asks inference_config for the endpoint URL, probes it, and — only when the
-- probe fails — runs the start script and probes once more.
--
-- @return the endpoint URL when the server answers, nil when it could not be
--         started
function M.ensure_ready()
    local endpoint = inference_config.build_host_url()

    print("=== Inference Server Manager ===")
    print("Target endpoint: " .. endpoint)
    print("")

    print("Checking if the inference server is running...")
    local already_up = is_server_running(endpoint)

    if not already_up then
        print("✗ Server not running — attempting to start it")

        -- The second probe only runs when the start script reported success.
        local launched = start_llamacpp_service()
        if not (launched and is_server_running(endpoint)) then
            print("✗ Failed to start the inference server")
            print(" HINT: run ./scripts/start-llamacpp-server.sh manually for verbose output")
            return nil
        end

        print("✓ Server is now ready at " .. endpoint)
        return endpoint
    end

    print("✓ Server is already running at " .. endpoint)
    return endpoint
end
-- }}}
92
-- {{{ local function shell_quote
-- Wraps a value in single quotes for /bin/sh, escaping embedded single
-- quotes, so it reaches the command as exactly one argument.
local function shell_quote(value)
    return "'" .. (tostring(value):gsub("'", "'\\''")) .. "'"
end
-- }}}

-- {{{ local function json_escape
-- Escapes a value for use inside a double-quoted JSON string literal:
-- double quotes, backslashes and control characters are replaced by their
-- JSON escape sequences.
local function json_escape(text)
    return (tostring(text):gsub('[%c"\\]', function(ch)
        if ch == '"' then
            return '\\"'
        end
        if ch == "\\" then
            return "\\\\"
        end
        return string.format("\\u%04x", ch:byte())
    end))
end
-- }}}

-- {{{ function M.test_embedding
-- Sends one test request to <endpoint>/v1/embeddings (OpenAI request shape)
-- through curl, stores the response in <DIR>/tmp/embedding_test.json, and
-- checks that the body mentions both "data" and "embedding".
--
-- The check is a plain substring search, not a JSON parse: vector contents
-- and dimensions are not validated here.
--
-- @param endpoint  base URL of the inference server (no trailing slash)
-- @param model     model name placed in the request body
-- @return true when the response looks like an embedding result, false when
--         curl fails, the response file cannot be read, or the body does not
--         contain the expected keys
function M.test_embedding(endpoint, model)
    print("Testing embedding generation...")

    os.execute(string.format('"%s/scripts/ensure-tmp-symlink" "%s"', DIR, DIR))
    local result_path = DIR .. "/tmp/embedding_test.json"

    -- The model name is JSON-escaped, and every interpolated value is passed
    -- to the shell as a single quoted argument, so quotes, spaces or shell
    -- metacharacters in the endpoint, model name or project path cannot
    -- break the command line or the request body.
    local payload = string.format(
        '{"model": "%s", "input": "test embedding"}',
        json_escape(model)
    )
    local test_cmd = string.format(
        "curl -s -X POST %s -H 'Content-Type: application/json' -d %s > %s",
        shell_quote(string.format("%s/v1/embeddings", endpoint)),
        shell_quote(payload),
        shell_quote(result_path)
    )

    -- A failed transfer is reported explicitly instead of being diagnosed
    -- later from an empty response file.
    local status = os.execute(test_cmd)
    if not (status == 0 or status == true) then
        print("✗ Embedding request failed (curl could not complete the request)")
        return false
    end

    local result_file = io.open(result_path, "r")
    if not result_file then
        print("✗ Failed to read test response")
        return false
    end
    local content = result_file:read("*a")
    result_file:close()

    if content:find('"data"', 1, true) and content:find('"embedding"', 1, true) then
        print("✓ Embedding generation test passed")
        return true
    end

    print("✗ Embedding generation test failed")
    print("Response: " .. content)
    return false
end
-- }}}
129
-- {{{ function M.main
-- Entry point used when the manager is run as a script.
--
-- @param interactive_mode  truthy: print a two-option menu and read the
--                          choice from stdin; falsy: ensure the server is up
--                          and then run the embedding test
function M.main(interactive_mode)
    -- Shared by the non-interactive path and by menu option 1: the test only
    -- runs when ensure_ready() hands back an endpoint.
    local function ensure_then_test()
        local endpoint = M.ensure_ready()
        if endpoint then
            M.test_embedding(endpoint, inference_config.get_selected_model())
        end
    end

    if not interactive_mode then
        ensure_then_test()
        return
    end

    print("=== Embedding Server Manager (interactive) ===")
    print("1. Ensure server is running, then test (default config)")
    print("2. Test embedding generation only")
    io.write("Select option (1-2): ")
    local choice = io.read()

    if choice == "2" then
        -- Option 2 skips the liveness check and goes straight to the test.
        local endpoint = inference_config.build_host_url()
        M.test_embedding(endpoint, inference_config.get_selected_model())
    elseif choice == "1" or choice == "" then
        -- An empty line selects the default, option 1.
        ensure_then_test()
    else
        print("Invalid choice")
    end
end
-- }}}
159
-- Run the manager only when this file is the script the interpreter was
-- started with (e.g. `lua src/embedding-server-manager.lua [DIR] [-I]`).
-- The standalone interpreter sets the global `arg` for the whole Lua state,
-- so it is also present while another script require()s this module; its
-- mere existence would make require() trigger M.main() — and possibly a
-- server launch — as a load-time side effect. arg[0] holds the script name,
-- so main only runs when that name refers to this file.
local script_name = type(arg) == "table" and type(arg[0]) == "string"
    and arg[0]:match("[^/\\]*$")
    or nil
local launched_directly = script_name ~= nil
    and script_name:find("embedding-server-manager", 1, true) ~= nil

if launched_directly then
    -- "-I" anywhere on the command line selects the interactive menu.
    local interactive_mode = false
    for _, arg_val in ipairs(arg) do
        if arg_val == "-I" then
            interactive_mode = true
            break
        end
    end
    M.main(interactive_mode)
end
170
171return M
172