scripts/start-llamacpp-server.sh

348 lines

1#!/bin/bash
2# scripts/start-llamacpp-server.sh
3# Launches the project-local llama.cpp embedding server. Reads which server
4# (host, port, model file) to launch from config.lua's inference_servers
5# via libs/inference-server-config.lua, sets up the CUDA runtime on
6# LD_LIBRARY_PATH so the binary's dlopen of libcudart succeeds, then
7# starts llama-server with --embedding so the OpenAI-compatible
8# /v1/embeddings endpoint is active. Verifies the server is responsive
9# via /health before declaring success.
10#
# Usage:
# ./scripts/start-llamacpp-server.sh # Default server from config.lua
# ./scripts/start-llamacpp-server.sh --server=NAME # Specific server entry
# ./scripts/start-llamacpp-server.sh --model=NAME # Specific model from that server's available_models
# ./scripts/start-llamacpp-server.sh /custom/dir # Override project DIR
# ./scripts/start-llamacpp-server.sh --help # Show this message
16#
17# Replaces scripts/start-ollama-cuda.sh as the embedding-backend launcher
18# per issue 10-049. The on-disk binary at libs/llama.cpp/bin/llama-server
19# is produced by scripts/build-deps.sh; if it is missing, run that script
20# first.
21
22# {{{ Hard-coded project directory and default state
# Project root used to locate libs/, scripts/ and tmp/; a positional argument
# replaces it. SERVER_NAME and MODEL_OVERRIDE stay empty unless --server= /
# --model= are given, in which case the config defaults are overridden.
DIR='/mnt/mtwo/programming/ai-stuff/neocities-modernization'
SERVER_NAME=''
MODEL_OVERRIDE=''
26# }}}
27
28# {{{ Color codes
# ANSI escape sequences kept as literal backslash text; they are only turned
# into real escapes when printed through `echo -e`.
C_GREEN='\033[92m'
C_BLUE='\033[94m'
C_RED='\033[91m'
C_YELLOW='\033[93m'
C_RESET='\033[0m'
34# }}}
35
36# {{{ parse_arguments
37# Recognized flags:
38# --server=NAME : override the default_inference_server from config
39# --model=NAME : serve a specific model from that server's available_models
40# (loads that model's GGUF); defaults to the server's model
41# /path/to/dir : override the project DIR (positional)
parse_arguments() {
    # Consume the argument list front to back. Each recognised argument writes
    # one global (SERVER_NAME, MODEL_OVERRIDE or DIR); when an argument is
    # repeated, the last occurrence wins.
    while [ "$#" -gt 0 ]; do
        case "$1" in
            --help|-h)
                # Print the header comment of this script (line 2 up to the
                # first blank line) with the leading "# " removed, then stop.
                sed -n '2,/^$/p' "$0" | sed 's/^# \?//'
                exit 0
                ;;
            --server=*)
                # Everything after the first "=" is the server entry name.
                SERVER_NAME="${1#*=}"
                ;;
            --model=*)
                # Everything after the first "=" names one of the server's
                # available_models; its GGUF is loaded instead of the server
                # default. Used by the model-comparison harness.
                MODEL_OVERRIDE="${1#*=}"
                ;;
            -*)
                # Any other dash-prefixed argument is a usage error.
                echo -e "${C_RED}Unknown option: $1${C_RESET}" >&2
                echo "Run with --help for usage." >&2
                exit 1
                ;;
            *)
                # A bare word is taken as the project directory.
                DIR="$1"
                ;;
        esac
        shift
    done
}
69# }}}
70
71# {{{ resolve_server_config
72# Asks libs/inference-server-config.lua to resolve the chosen server (default
73# or --server=NAME override) and prints four lines to stdout: host, port,
74# model_path (relative to DIR), and the model identifier. Errors from the
75# module (typoed --server, missing default) propagate to stderr verbatim.
76# Running the module in a subprocess keeps the parent shell free of any
77# stray Lua state and lets us capture exactly the fields we want.
resolve_server_config() {
    # Resolves the selected inference server through
    # libs/inference-server-config.lua and prints four lines to stdout:
    # host, port, model_path, model identifier.
    #
    # Globals read: DIR, SERVER_NAME (optional), MODEL_OVERRIDE (optional).
    # Returns: luajit's exit status; Lua errors go to stderr untouched.
    #
    # The three shell values are handed to Lua through the environment and
    # read with os.getenv instead of being spliced into the Lua source. A
    # directory or name containing an apostrophe or a backslash therefore
    # cannot break the chunk or inject code into it. The assignments are
    # prefixes of the luajit command, so they never leak into this shell.
    LLAMACPP_PROJECT_ROOT="$DIR" \
    LLAMACPP_SERVER_NAME="$SERVER_NAME" \
    LLAMACPP_MODEL_NAME="$MODEL_OVERRIDE" \
    luajit -e '
        local root = os.getenv("LLAMACPP_PROJECT_ROOT") or ""
        local server_name = os.getenv("LLAMACPP_SERVER_NAME") or ""
        local model_name = os.getenv("LLAMACPP_MODEL_NAME") or ""

        package.path = root .. "/libs/?.lua;" .. package.path
        local inference = require("inference-server-config")
        inference.set_project_root(root)

        -- An empty value means "no override": keep the defaults from config.
        if server_name ~= "" then
            inference.set_selected_server(server_name)
        end
        if model_name ~= "" then
            inference.set_selected_model(model_name)
        end

        local server = inference.get_selected_server()
        -- Resolve the GGUF for the SELECTED model (default = server.model), so
        -- a --model override on a multi-model server loads the right file.
        local mc = inference.get_selected_model_config()
        if not mc.model_path then
            error("inference-server-config: model \"" .. tostring(mc.model)
                .. "\" on server \"" .. server.name .. "\" has no model_path; add it "
                .. "to the server entry or to that model in available_models")
        end
        print(server.host)
        print(server.port)
        print(mc.model_path)
        print(mc.model)
    '
}
108# }}}
109
110# {{{ setup_env
111# Prepend libs/cuda to PATH and LD_LIBRARY_PATH so the llama-server binary
112# finds nvcc tools (when invoked) and libcudart at dlopen time. The binary's
113# own RPATH covers libs/llama.cpp/lib (built into the binary by build-deps.sh
114# via -DCMAKE_INSTALL_RPATH='$ORIGIN/../lib'), so we do not need to add that
115# directory explicitly.
116#
117# GGML_CUDA_FORCE_MMQ was tried 2026-06-20 and reverted. The hard-freeze
118# symptom predated this flag, so MMQ wasn't the cause; forcing MMQ might
119# just be picking a different-but-also-buggy CUDA kernel path on Pascal.
120# Removed to use the default kernel selection (cuBLAS where applicable).
setup_env() {
    # Put the project-local CUDA directories in front of the inherited search
    # paths so they take precedence. LD_LIBRARY_PATH may be unset on entry; in
    # that case the result ends with a trailing ":".
    local cuda_root="${DIR}/libs/cuda"
    PATH="${cuda_root}/bin:${PATH}"
    LD_LIBRARY_PATH="${cuda_root}/lib64:${LD_LIBRARY_PATH:-}"
    export PATH LD_LIBRARY_PATH
}
125# }}}
126
127# {{{ already_running
128# Returns 0 if some server is already healthy at HOST:PORT — meaning we
129# should not try to start a second one. Returns non-zero otherwise.
already_running() {
    # Succeeds only when /health at HOST:PORT answers HTTP 200. curl's own exit
    # status is not enough: with -s it is 0 for a 503 as well, and a server
    # that is still loading answers 503. Counting that as "already running"
    # would skip our start and then fail the caller's readiness check, so the
    # status code itself is compared.
    local health_url="http://${HOST}:${PORT}/health"
    local status
    status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 "$health_url" 2>/dev/null)
    [[ "$status" == "200" ]]
}
139# }}}
140
141# {{{ verify_artifacts
142# Sanity-check that the things we need to actually run llama-server exist
143# on disk before launching. Failing here with a clear "run build-deps.sh"
144# message is friendlier than letting the operator debug a missing-binary
145# error or a missing-model crash from the server's log.
verify_artifacts() {
    # Pre-flight check: exits the script with status 1 (after a hint on
    # stderr) when the llama-server binary or the model file is missing.
    # Reads the globals DIR and MODEL_PATH; MODEL_PATH is joined onto DIR.
    local llama_bin="${DIR}/libs/llama.cpp/bin/llama-server"
    local gguf_file="${DIR}/${MODEL_PATH}"

    # The binary must exist and be executable.
    if ! [ -x "$llama_bin" ]; then
        echo -e "${C_RED}llama-server not found at $llama_bin${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} run ./scripts/build-deps.sh to build llama.cpp" >&2
        exit 1
    fi

    # The model must be a regular file.
    if ! [ -f "$gguf_file" ]; then
        echo -e "${C_RED}Model file not found: $gguf_file${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} run ./scripts/build-deps.sh to download the GGUF" >&2
        exit 1
    fi
}
161# }}}
162
163# {{{ launch_server
164# Start llama-server in the background with --embedding so the embedding
165# endpoints are active. Output goes to LOG_FILE in the RAM-backed tmp/
166# directory; SERVER_PID is captured so the operator can stop it cleanly.
launch_server() {
    # Starts llama-server as a background job and stores its PID in the global
    # SERVER_PID. Reads DIR, MODEL_PATH, MODEL, HOST, PORT and LOG_FILE, plus
    # NEOCITIES_LOG_DIR, whose presence (non-empty) switches on debug mode.
    local llama_bin="${DIR}/libs/llama.cpp/bin/llama-server"
    local gguf_file="${DIR}/${MODEL_PATH}"

    local debug_mode=0
    if [ -n "${NEOCITIES_LOG_DIR:-}" ]; then
        debug_mode=1
    fi

    echo -e "${C_BLUE}Starting llama-server${C_RESET}"
    echo -e " bin: $llama_bin"
    echo -e " model: $gguf_file"
    echo -e " host: $HOST"
    echo -e " port: $PORT"
    echo -e " log: $LOG_FILE"

    # The flags are spelled out explicitly so they mirror the known-good
    # words-pdf launcher, which runs the byte-identical
    # nomic-embed-text-v1.5.Q8_0 GGUF on the same machine without freezing.
    # Relying on llama-server's defaults diverged from that reference.
    local -a server_args=()

    server_args+=( -m "$gguf_file" )

    # --alias: advertise the model under the name config.lua uses for it (e.g.
    # "embeddinggemma-300m") rather than the GGUF filename. /v1/models then
    # reports that exact name, so callers that check "is my model loaded?" by
    # name succeed. Without it the server reports the .gguf path, and a name
    # like embeddinggemma-300m does not match embeddinggemma-300M-Q8_0.gguf
    # (case differs), which read as "model absent" although it was serving.
    server_args+=( --alias "$MODEL" )

    # --embedding enables the embedding endpoints; host/port come from config.
    server_args+=( --embedding --host "$HOST" --port "$PORT" )

    # --n-gpu-layers 99: offload ALL layers to the GPU. Without an explicit
    # value the model can end up PARTIALLY offloaded, which shuttles tensors
    # across PCIe every forward pass and interleaves longer-lived GPU work with
    # copies: more opportunity to starve the display compositor on Pascal.
    server_args+=( --n-gpu-layers 99 )

    # --ctx-size / --batch-size / --ubatch-size 8192: embedding mode has no
    # chunking, so the WHOLE input must fit in one ubatch. The default ubatch
    # is 512 (~2048 chars), so longer poems were REJECTED with "input too large
    # to process". 8192 tokens (~32k chars) lets all but a couple of giant
    # poems through. (nomic's own context caps at 2048 tokens, so longer inputs
    # are truncated by the model; full-fidelity handling of those is a separate
    # chunk-and-average task, see issues/.)
    server_args+=( --ctx-size 8192 --batch-size 8192 --ubatch-size 8192 )

    # --parallel 1: one request slot. Each slot allocates its own KV cache, so
    # VRAM pressure grows linearly with the slot count. This is also the
    # server default; it is explicit so nobody raises it assuming it is free.
    server_args+=( --parallel 1 )

    # --mlock: pin the model (~140 MB Q8_0) in RAM so the kernel cannot swap
    # parts of it out under memory pressure. On Pascal, swap-induced stutter
    # can stretch a CUDA kernel's wall time past the display watchdog timeout.
    server_args+=( --mlock )

    if [ "$debug_mode" -eq 1 ]; then
        # Debug only: -lv 1 emits per-request slot lines, valuable when
        # diagnosing a freeze but noise otherwise.
        server_args+=( -lv 1 )
        # Route stdout/stderr through fsync-logger so each log line is
        # committed to disk immediately and survives a hard lock. The process
        # substitution is a SIBLING of llama-server, so $! is still
        # llama-server's PID: the process we health-check and kill.
        "$llama_bin" "${server_args[@]}" \
            > >("${DIR}/scripts/fsync-logger" --quiet "$LOG_FILE") 2>&1 &
    else
        # Normal runs: a plain file redirect keeps things fast.
        "$llama_bin" "${server_args[@]}" > "$LOG_FILE" 2>&1 &
    fi
    SERVER_PID=$!
    echo -e " pid: $SERVER_PID"
}
251# }}}
252
253# {{{ wait_for_ready
254# Poll /health until the server responds or we hit the timeout. Returns 0
255# on success, 1 on timeout. Using /health is the lightest "are you alive"
256# probe the server exposes; /v1/models would also work but takes slightly
257# longer to respond because it walks the loaded model list.
wait_for_ready() {
    # Polls /health at HOST:PORT once per second until it answers HTTP 200.
    #
    # Arguments: $1 - optional maximum number of polls (default 180). Model
    #                 load plus the warm-up empty run (large batch) can take
    #                 well over 30s on a busy GPU, hence the generous default.
    # Globals:   HOST, PORT (read); SERVER_PID (read, optional).
    # Returns:   0 once /health answers 200; 1 on timeout, or as soon as the
    #            process recorded in SERVER_PID is found to have exited.
    local max_wait="${1:-180}"
    local health_url="http://${HOST}:${PORT}/health"
    local attempt=0
    local code
    while [ "$attempt" -lt "$max_wait" ]; do
        # CRITICAL: /health returns 503 while the model is still loading or
        # warming up and 200 only when it can actually serve. `curl -s` exits
        # 0 even on 503, so checking the exit code alone would declare "ready"
        # mid-warm-up and the first real request would then 503. Inspect the
        # HTTP STATUS CODE and wait for a genuine 200.
        code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 "$health_url" 2>/dev/null)
        if [ "$code" = "200" ]; then
            return 0
        fi
        # Fail fast: if the server we launched has already exited (bad flag,
        # missing library, port in use), no amount of polling will bring it
        # up, so do not make the operator sit through the whole timeout.
        if [ -n "${SERVER_PID:-}" ] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
            return 1
        fi
        sleep 1
        attempt=$((attempt + 1))
    done
    return 1
}
279# }}}
280
281# {{{ main
main() {
    # Entry point: parse arguments, resolve the server from config, make sure
    # the binary and model exist, start llama-server unless one is already
    # healthy at HOST:PORT, wait for readiness and record the PID.
    #
    # Arguments: the script's command-line arguments.
    # Globals written: LOG_DIR, LOG_FILE, HOST, PORT, MODEL_PATH, MODEL.
    # Exits: 0 when a server is ready (or was already running), 1 on failure.

    # Single source of truth for the readiness budget, so the wait and the
    # failure message cannot drift apart.
    local ready_timeout=180

    parse_arguments "$@"
    # Best effort: errors from the helper are intentionally discarded.
    "${DIR}/scripts/ensure-tmp-symlink" "${DIR}" 2>/dev/null
    # run.sh's --debug exports NEOCITIES_LOG_DIR pointing at durable disk
    # (output/debug-logs) so the server log survives the reboot a hard GPU
    # lock forces. Default stays in the RAM-backed tmp/. The PID file stays in
    # tmp regardless: it is runtime state, and run.sh's cleanup_inference_server
    # reads it from tmp/.
    LOG_DIR="${NEOCITIES_LOG_DIR:-${DIR}/tmp}"
    mkdir -p "${LOG_DIR}"
    LOG_FILE="${LOG_DIR}/llamacpp-server.log"

    echo -e "${C_BLUE}=================================${C_RESET}"
    echo -e "${C_BLUE} llama.cpp Embedding Server${C_RESET}"
    echo -e "${C_BLUE}=================================${C_RESET}"

    local config
    config=$(resolve_server_config) || exit 1
    # The resolver prints exactly four lines: host, port, model_path, model.
    {
        IFS= read -r HOST
        IFS= read -r PORT
        IFS= read -r MODEL_PATH
        IFS= read -r MODEL
    } <<< "$config"
    # A short or empty answer would otherwise surface later as a confusing
    # curl or llama-server error; stop here with a clear message instead.
    if [ -z "$HOST" ] || [ -z "$PORT" ] || [ -z "$MODEL_PATH" ]; then
        echo -e "${C_RED}Could not resolve host, port and model path from the inference server config${C_RESET}" >&2
        exit 1
    fi

    if [ -n "$SERVER_NAME" ]; then
        echo -e " ${C_GREEN}server${C_RESET}: $SERVER_NAME (--server override)"
    fi
    echo -e " ${C_GREEN}model${C_RESET}: $MODEL"

    setup_env
    verify_artifacts

    if already_running; then
        echo -e "${C_GREEN}✓ llama-server is already running at ${HOST}:${PORT}${C_RESET}"
        exit 0
    fi

    launch_server
    if ! wait_for_ready "$ready_timeout"; then
        echo -e "${C_RED}llama-server did not become responsive within ${ready_timeout} seconds${C_RESET}" >&2
        echo -e " Last 20 lines of log ($LOG_FILE):" >&2
        tail -n 20 "$LOG_FILE" >&2
        # Do not leave a half-started server behind.
        kill "$SERVER_PID" 2>/dev/null
        wait "$SERVER_PID" 2>/dev/null
        exit 1
    fi

    # Persist the PID so other processes (run.sh's auto-start cleanup, or
    # a manual operator using `kill $(cat tmp/llamacpp-server.pid)`) can
    # find the server later. The PID file is overwritten on each start.
    local pid_file="${DIR}/tmp/llamacpp-server.pid"
    echo "$SERVER_PID" > "$pid_file"

    echo -e "${C_GREEN}✅ llama-server ready at http://${HOST}:${PORT}${C_RESET}"
    echo
    echo -e "${C_BLUE}🔧 Service management:${C_RESET}"
    echo " • Logs: tail -f $LOG_FILE"
    echo " • Stop: kill \$(cat $pid_file) # PID: $SERVER_PID"
    echo " • Status: curl -s http://${HOST}:${PORT}/health"
    echo
    echo -e "${C_GREEN}🚀 Ready for embedding requests at http://${HOST}:${PORT}/v1/embeddings${C_RESET}"
}
343# }}}
344
345main "$@"
346
347# vim: set foldmethod=marker:
348