scripts/start-llamacpp-server.sh

348 lines

1#!/bin/bash

# scripts/start-llamacpp-server.sh
# Launches the project-local llama.cpp embedding server. Reads which server
# (host, port, model file) to launch from config.lua's inference_servers
# via libs/inference-server-config.lua, sets up the CUDA runtime on
# LD_LIBRARY_PATH so the binary's dlopen of libcudart succeeds, then
# starts llama-server with --embedding so the OpenAI-compatible
# /v1/embeddings endpoint is active. Verifies the server is responsive
# via /health before declaring success.
#

# Usage:
#   ./scripts/start-llamacpp-server.sh                # Default server from config.lua
#   ./scripts/start-llamacpp-server.sh --server=NAME  # Specific server entry
#   ./scripts/start-llamacpp-server.sh --model=NAME   # Specific model from that server's available_models
#   ./scripts/start-llamacpp-server.sh /custom/dir    # Override project DIR
#   ./scripts/start-llamacpp-server.sh --help         # Show this message
#

# Replaces scripts/start-ollama-cuda.sh as the embedding-backend launcher
# per issue 10-049. The on-disk binary at libs/llama.cpp/bin/llama-server
# is produced by scripts/build-deps.sh; if it is missing, run that script
# first.

# {{{ Hard-coded project directory and default state
# Project root. A positional command-line argument replaces this value.
DIR='/mnt/mtwo/programming/ai-stuff/neocities-modernization'
# Server entry requested with --server=NAME; empty means "no override".
SERVER_NAME=
# Model requested with --model=NAME; empty means "no override".
MODEL_OVERRIDE=
# }}}

# {{{ Color codes
# ANSI colour sequences stored as literal backslash text; they only turn into
# real escape bytes when printed through `echo -e`.
C_GREEN='\033[92m'
C_BLUE='\033[94m'
C_RED='\033[91m'
C_YELLOW='\033[93m'
C_RESET='\033[0m'
# }}}

# {{{ parse_arguments
# Parse the command line into the globals DIR, SERVER_NAME and MODEL_OVERRIDE.
#
# Recognized arguments:
#   --server=NAME : override the default_inference_server from config
#   --model=NAME  : serve a specific model from that server's available_models
#                   (loads that model's GGUF); defaults to the server's model
#   --help, -h    : print this script's header comment and exit 0
#   /path/to/dir  : override the project DIR (positional; the last one wins)
#
# Any other argument starting with '-' is reported on stderr and the script
# exits with status 1.
parse_arguments() {
    # Keep the loop variable local so parsing does not leave a stray global
    # named `arg` behind in the caller's shell.
    local arg
    for arg in "$@"; do
        case "$arg" in
            --server=*)
                SERVER_NAME="${arg#*=}"
                ;;
            --model=*)
                # Pick a specific model the server can serve (one of its
                # available_models) and load that model's GGUF instead of the
                # server default. Used by the model-comparison harness.
                MODEL_OVERRIDE="${arg#*=}"
                ;;
            --help|-h)
                # Print the script from line 2 up to the first blank line,
                # stripping the leading "# " so it reads as plain text.
                sed -n '2,/^$/p' "$0" | sed 's/^# \?//'
                exit 0
                ;;
            -*)
                echo -e "${C_RED}Unknown option: $arg${C_RESET}" >&2
                echo "Run with --help for usage." >&2
                exit 1
                ;;
            *)
                DIR="$arg"
                ;;
        esac
    done
}
# }}}

# {{{ resolve_server_config
# Asks libs/inference-server-config.lua to resolve the chosen server (default
# or --server=NAME override) and prints four lines to stdout: host, port,
# model_path (relative to DIR), and the model identifier. Errors from the
# module (typoed --server, missing default) propagate to stderr verbatim.
# Running the module in a subprocess keeps the parent shell free of any
# stray Lua state and lets us capture exactly the fields we want.
#
# Globals read: DIR, SERVER_NAME, MODEL_OVERRIDE.
# Returns: luajit's exit status.
#
# The three values are handed to Lua as script ARGUMENTS (received through
# `...`), never spliced into the Lua source text. A project path or name that
# contains an apostrophe or backslash therefore cannot break the chunk or
# inject Lua code. The heredoc delimiter is quoted so the shell expands
# nothing inside the Lua program.
resolve_server_config() {
    luajit - "$DIR" "$SERVER_NAME" "$MODEL_OVERRIDE" <<'LUA'
local project_root, server_name, model_name = ...
package.path = project_root .. '/libs/?.lua;' .. package.path
local inference = require('inference-server-config')
inference.set_project_root(project_root)
-- An empty string means the corresponding flag was not given.
if server_name ~= '' then
  inference.set_selected_server(server_name)
end
if model_name ~= '' then
  inference.set_selected_model(model_name)
end
local server = inference.get_selected_server()
-- Resolve the GGUF for the SELECTED model (default = server.model), so a
-- --model override on a multi-model server loads the right file.
local mc = inference.get_selected_model_config()
if not mc.model_path then
  error('inference-server-config: model "' .. tostring(mc.model)
    .. '" on server "' .. server.name .. '" has no model_path; add it '
    .. 'to the server entry or to that model in available_models')
end
print(server.host)
print(server.port)
print(mc.model_path)
print(mc.model)
LUA
}
# }}}

109

# {{{ setup_env
# Prepend libs/cuda to PATH and LD_LIBRARY_PATH so the llama-server binary
# finds nvcc tools (when invoked) and libcudart at dlopen time. The binary's
# own RPATH covers libs/llama.cpp/lib (built into the binary by build-deps.sh
# via -DCMAKE_INSTALL_RPATH='$ORIGIN/../lib'), so we do not need to add that
# directory explicitly.
#
# GGML_CUDA_FORCE_MMQ was tried 2026-06-20 and reverted. The hard-freeze
# symptom predated this flag, so MMQ wasn't the cause; forcing MMQ might
# just be picking a different-but-also-buggy CUDA kernel path on Pascal.
# Removed to use the default kernel selection (cuBLAS where applicable).
#
# Globals read: DIR. Exports: PATH, LD_LIBRARY_PATH.
setup_env() {
    # Append the previous value only when there is one. A bare trailing ':'
    # leaves an empty entry in the list, and an empty entry in a search path
    # means "the current working directory" -- not something we want the
    # loader (or command lookup) to consult implicitly.
    export PATH="${DIR}/libs/cuda/bin${PATH:+:${PATH}}"
    export LD_LIBRARY_PATH="${DIR}/libs/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
}
# }}}

126

# {{{ already_running
# Succeeds (status 0) only when a server at HOST:PORT answers /health with
# HTTP 200, i.e. one is already serving and a second must not be started.
# Any other outcome yields status 1.
#
# Globals read: HOST, PORT.
already_running() {
    local health_url="http://${HOST}:${PORT}/health"
    local http_status

    # The status code is what matters, not curl's exit code: a server that is
    # still loading replies 503 and `curl -s` exits 0 for that as well. Taking
    # a 503 for "already running" would skip our own start AND leave the
    # caller's readiness check failing.
    http_status=$(
        curl -s -o /dev/null -w "%{http_code}" --max-time 2 "$health_url" 2>/dev/null
    )

    if [[ "$http_status" == "200" ]]; then
        return 0
    fi
    return 1
}
# }}}

140

# {{{ verify_artifacts
# Pre-flight check: confirm the llama-server binary is executable and the
# model file exists before anything is launched. Stopping here with a
# "run build-deps.sh" hint is kinder to the operator than a missing-binary
# error or a missing-model crash buried in the server log.
#
# Globals read: DIR, MODEL_PATH, C_RED, C_YELLOW, C_RESET.
# Exits the script with status 1 if either artifact is missing.
verify_artifacts() {
    local llama_bin="${DIR}/libs/llama.cpp/bin/llama-server"
    local gguf_file="${DIR}/${MODEL_PATH}"

    if ! [[ -x "$llama_bin" ]]; then
        echo -e "${C_RED}llama-server not found at $llama_bin${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} run ./scripts/build-deps.sh to build llama.cpp" >&2
        exit 1
    fi

    if ! [[ -f "$gguf_file" ]]; then
        echo -e "${C_RED}Model file not found: $gguf_file${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} run ./scripts/build-deps.sh to download the GGUF" >&2
        exit 1
    fi
}
# }}}

162

# {{{ launch_server
# Starts llama-server as a background process with --embedding so the
# embedding endpoints are active. Server output is written to LOG_FILE and
# the process ID is stored in SERVER_PID so the operator can stop it cleanly.
#
# Globals read: DIR, MODEL_PATH, MODEL, HOST, PORT, LOG_FILE,
#               NEOCITIES_LOG_DIR (optional), C_BLUE, C_RESET.
# Globals written: SERVER_PID.
#
# Why each launch flag is explicit. The flags mirror the known-good words-pdf
# launcher (scripts/start-llamacpp-server.sh there), which runs the
# byte-identical nomic-embed-text-v1.5.Q8_0 GGUF on the same machine without
# freezing. Earlier this launcher set none of the GPU/batch flags and relied
# on llama-server's defaults, which differ from that reference in ways that
# matter:
#
#   --alias MODEL     : advertise the model under the name config.lua uses
#                       (e.g. "embeddinggemma-300m"), not the GGUF filename.
#                       /v1/models then reports that exact name, so callers
#                       that check "is my model loaded?" by name succeed.
#                       Without it the server reports the .gguf path, and
#                       embeddinggemma-300m fails to match
#                       embeddinggemma-300M-Q8_0.gguf (the case differs),
#                       which looked like a missing model even though it was
#                       loaded and serving fine.
#   --n-gpu-layers 99 : offload ALL layers to the GPU. Without an explicit
#                       value the default can leave the model PARTIALLY
#                       offloaded, which shuttles tensors across PCIe every
#                       forward pass and interleaves longer-lived GPU work
#                       with copies -- more opportunity to starve the display
#                       compositor on Pascal.
#   --ctx-size / --batch-size / --ubatch-size 8192:
#                       embedding mode has no chunking -- the WHOLE input must
#                       fit in one ubatch. The default ubatch is 512 (~2048
#                       chars), so longer poems were REJECTED with "input too
#                       large to process". 8192 tokens (~32k chars) lets all
#                       but a couple of giant poems through. (nomic's own
#                       context caps at 2048 tokens, so longer inputs are
#                       truncated by the model; full-fidelity handling is a
#                       separate chunk-and-average task, see issues/.)
#   --parallel 1      : cap concurrent request slots at 1. Each slot
#                       allocates its own KV cache, multiplying VRAM pressure
#                       linearly. Also llama-server's default; stated
#                       explicitly so nobody raises it assuming it is free.
#   --mlock           : pin the model (~140 MB Q8_0) in RAM so the kernel
#                       cannot swap parts of it out under memory pressure. On
#                       Pascal, swap-induced stuttering can stretch a CUDA
#                       kernel's wall time past the display watchdog timeout.
#                       Costs ~140 MB of pinned RAM, cheap on a 31 GB host.
#   -lv 1             : verbose; added ONLY under --debug (NEOCITIES_LOG_DIR
#                       set). Its per-request slot lines are valuable when
#                       diagnosing a freeze but noise during normal runs.
launch_server() {
    local llama_bin="${DIR}/libs/llama.cpp/bin/llama-server"
    local gguf_file="${DIR}/${MODEL_PATH}"

    printf '%b\n' \
        "${C_BLUE}Starting llama-server${C_RESET}" \
        " bin: $llama_bin" \
        " model: $gguf_file" \
        " host: $HOST" \
        " port: $PORT" \
        " log: $LOG_FILE"

    local -a server_args=(
        -m "$gguf_file"
        --alias "$MODEL"
        --embedding
        --host "$HOST"
        --port "$PORT"
        --n-gpu-layers 99
        --ctx-size 8192
        --batch-size 8192
        --ubatch-size 8192
        --parallel 1
        --mlock
    )

    if [[ -n "${NEOCITIES_LOG_DIR:-}" ]]; then
        # Debug run: turn on verbose logging and route stdout/stderr through
        # fsync-logger so every log line reaches disk immediately and
        # survives a hard lock. The process substitution is a SIBLING of
        # llama-server, so $! is still llama-server's PID -- the process we
        # health-check and kill.
        server_args+=( -lv 1 )
        "$llama_bin" "${server_args[@]}" \
            > >("${DIR}/scripts/fsync-logger" --quiet "$LOG_FILE") 2>&1 &
    else
        # Normal run: a plain file redirect keeps things fast.
        "$llama_bin" "${server_args[@]}" > "$LOG_FILE" 2>&1 &
    fi
    SERVER_PID=$!
    printf '%b\n' " pid: $SERVER_PID"
}
# }}}

252

# {{{ wait_for_ready
# Poll /health once per second until the server answers HTTP 200, the launched
# process dies, or max_wait polls have been made. /health is the lightest
# "are you alive" probe the server exposes; /v1/models would also work but
# takes slightly longer to respond because it walks the loaded model list.
#
# Globals read: HOST, PORT, SERVER_PID (optional).
# Returns: 0 once /health returns 200; 1 on timeout or when the process
#          recorded in SERVER_PID no longer exists.
wait_for_ready() {
    # Model load + the warm-up empty run (large batch) can take well over 30s
    # on a busy GPU, so allow generous headroom.
    local max_wait=180
    local health_url="http://${HOST}:${PORT}/health"
    local code
    local i

    for (( i = 0; i < max_wait; i++ )); do
        # CRITICAL: /health returns 503 while the model is still loading or
        # warming up and 200 only when it can actually serve. `curl -s` exits
        # 0 even on 503, so checking the exit code alone would declare "ready"
        # mid-warm-up and the first real request would then 503. Inspect the
        # HTTP STATUS CODE and wait for a genuine 200.
        code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 "$health_url" 2>/dev/null)
        if [[ "$code" == "200" ]]; then
            return 0
        fi

        # Fail fast: if the process we launched has already exited (bad flag,
        # CUDA init failure, port in use, ...) it will never become healthy,
        # so there is no point burning the rest of the timeout. Skipped when
        # SERVER_PID is unset so the function still works standalone.
        if [[ -n "${SERVER_PID:-}" ]] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
            return 1
        fi

        sleep 1
    done
    return 1
}
# }}}

280

# {{{ main
# Orchestrates one launch: parse the arguments, choose the log location,
# resolve the server entry, prepare the environment, verify the on-disk
# artifacts, and start llama-server unless a healthy one already answers at
# HOST:PORT.
#
# Arguments: the script's command-line arguments, passed on to
#            parse_arguments.
# Globals written: LOG_DIR, LOG_FILE, HOST, PORT, MODEL_PATH, MODEL.
# Exit status: 0 when a server is ready (already running or freshly started);
#              1 when the config cannot be resolved or the server never
#              becomes ready.
main() {
    parse_arguments "$@"
    # Errors from this helper are deliberately silenced, so a missing or
    # failing helper does not abort the launch.
    "${DIR}/scripts/ensure-tmp-symlink" "${DIR}" 2>/dev/null
    # run.sh's --debug exports NEOCITIES_LOG_DIR pointing at durable disk
    # (output/debug-logs) so the server log survives the reboot a hard GPU
    # lock forces. Default stays in the RAM-backed tmp/. The PID file stays in
    # tmp regardless: it is runtime state, and run.sh's
    # cleanup_inference_server reads it from tmp/.
    LOG_DIR="${NEOCITIES_LOG_DIR:-${DIR}/tmp}"
    mkdir -p "${LOG_DIR}"
    LOG_FILE="${LOG_DIR}/llamacpp-server.log"

    echo -e "${C_BLUE}=================================${C_RESET}"
    echo -e "${C_BLUE} llama.cpp Embedding Server${C_RESET}"
    echo -e "${C_BLUE}=================================${C_RESET}"

    local config
    config=$(resolve_server_config) || exit 1
    # resolve_server_config prints host, port, model path and model name, one
    # per line, in that order. Split them in-process instead of spawning an
    # echo|sed pipeline per field.
    local -a fields
    mapfile -t fields <<< "$config"
    HOST="${fields[0]:-}"
    PORT="${fields[1]:-}"
    MODEL_PATH="${fields[2]:-}"
    MODEL="${fields[3]:-}"

    # Without a host and a port every later health check would probe a
    # malformed URL and merely time out; stop with a clear message instead.
    if [[ -z "$HOST" || -z "$PORT" ]]; then
        echo -e "${C_RED}Inference server config did not yield a host and port${C_RESET}" >&2
        exit 1
    fi

    if [ -n "$SERVER_NAME" ]; then
        echo -e " ${C_GREEN}server${C_RESET}: $SERVER_NAME (--server override)"
    fi
    echo -e " ${C_GREEN}model${C_RESET}: $MODEL"

    setup_env
    verify_artifacts

    if already_running; then
        echo -e "${C_GREEN}✓ llama-server is already running at ${HOST}:${PORT}${C_RESET}"
        exit 0
    fi

    launch_server
    if ! wait_for_ready; then
        # The message names no figure: the wait length is owned by
        # wait_for_ready, and quoting a number here is how the text went
        # stale before.
        echo -e "${C_RED}llama-server did not become responsive before the readiness timeout${C_RESET}" >&2
        echo -e " Last 20 lines of log ($LOG_FILE):" >&2
        tail -n 20 "$LOG_FILE" >&2
        # Do not leave a half-started server behind; reap it before exiting.
        kill "$SERVER_PID" 2>/dev/null
        wait "$SERVER_PID" 2>/dev/null
        exit 1
    fi

    # Persist the PID so other processes (run.sh's auto-start cleanup, or
    # a manual operator using `kill $(cat tmp/llamacpp-server.pid)`) can
    # find the server later. The PID file is overwritten on each start.
    local pid_file="${DIR}/tmp/llamacpp-server.pid"
    echo "$SERVER_PID" > "$pid_file"

    echo -e "${C_GREEN}✅ llama-server ready at http://${HOST}:${PORT}${C_RESET}"
    echo
    echo -e "${C_BLUE}🔧 Service management:${C_RESET}"
    echo " • Logs: tail -f $LOG_FILE"
    echo " • Stop: kill \$(cat $pid_file) # PID: $SERVER_PID"
    echo " • Status: curl -s http://${HOST}:${PORT}/health"
    echo
    echo -e "${C_GREEN}🚀 Ready for embedding requests at http://${HOST}:${PORT}/v1/embeddings${C_RESET}"
}
# }}}

344

# Script entry point: hand every command-line argument to main untouched.
main "${@}"

# vim: set foldmethod=marker:

348