scripts/build-deps.sh

852 lines

1#!/bin/bash
2# scripts/build-deps.sh
3# Downloads CUDA and llama.cpp source into the RAM-backed tmp/ tree, builds
4# llama.cpp there, then installs the finished binaries + shared libraries
5# into libs/ so the project owns a clean, disk-backed copy. Replaces the
6# system-installed Ollama daemon as the embedding backend per issue 10-049.
7#
8# CUDA install: download CUDA 12.9 from NVIDIA (~5 GB) into tmp/downloads/
9# and install via the runfile installer DIRECTLY into libs/cuda/ using
10# --toolkitpath. No sudo required — nothing touches /usr/local or /var/log.
11# CUDA 12.9 was chosen because it is the most recent toolkit that still
12# supports Pascal (sm_61, the 1080 Ti) AND officially supports gcc up to
13# 14.x — so no -allow-unsupported-compiler workaround is needed on
14# rolling-distro hosts running gcc 14.
15#
16# CUDA 13.0+ would solve gcc compatibility officially but dropped Pascal
17# entirely, so this project is pinned to the 12.x line for as long as it
18# cares about the 1080 Ti. Ollama's bundled CUDA was tried as a download-
19# free shortcut, but its bundled libs are built without Pascal in the arch
20# list, so the binaries do not actually run on the 1080 Ti even though the
21# toolkit metadata says they should. Removed in favor of the one path that
22# is known to work end-to-end.
23#
24# llama.cpp source/build live in tmp/llamacpp-src/ (RAM-backed), and the
25# install step copies only the finished bin/, lib/, and include/ into
26# libs/llama.cpp/ via "cmake --install --prefix". This way disk holds the
27# ~100 MB of artifacts that need to persist, and RAM absorbs the 1–3 GB of
28# build churn.
29#
30# Usage:
31# ./scripts/build-deps.sh # Build into the default project DIR
32# ./scripts/build-deps.sh /custom/dir # Build into a different project DIR
33# ./scripts/build-deps.sh --clean # Wipe tmp source AND libs install
34# ./scripts/build-deps.sh --no-model # Skip the GGUF model download
35# ./scripts/build-deps.sh --skip-cuda # Trust whatever CUDA is already present
36# ./scripts/build-deps.sh --force-cuda # Wipe libs/cuda/ before installing
37# ./scripts/build-deps.sh --help # Show this message
38#
39# Environment:
40# BUILD_JOBS=N # Parallel cmake build jobs (default: 8)
41# # Lower this if the host CPU is overheating.
42#
43# What this gives you on success:
44# $DIR/libs/cuda/ — the CUDA toolkit (nvcc, libcudart, etc.)
45# $DIR/libs/llama.cpp/bin/ — llama-server, llama-cli, llama-embedding
46# $DIR/libs/llama.cpp/lib/ — libllama.so, libggml*.so
47# $DIR/tmp/llamacpp-src/ — RAM-backed clone + build (ephemeral)
48# $DIR/assets/models/<model>.gguf — the embedding model file
49# A smoke-tested working install ready for the 10-049 migration.
50
51# {{{ Hard-coded project directory
# Default project root. parse_arguments replaces it when a positional
# directory argument is supplied on the command line.
DIR='/mnt/mtwo/programming/ai-stuff/neocities-modernization'
53# }}}
54
55# {{{ Pinned versions
# llama.cpp is pinned to a specific tag instead of following master, so an
# upstream change cannot silently break the build. Bump only after testing.
#
# History: b4404 -> b9842 (Issue 10-031). b4404 predates the
# "gemma-embedding" model architecture, so EmbeddingGemma GGUFs failed to
# load with "unknown model architecture"; b9842 recognises it. Between those
# tags upstream moved the server/cli/embedding binaries from examples/ to
# tools/, which is why build_llamacpp also passes -DLLAMA_BUILD_TOOLS=ON --
# without it the build yields the shared libs but no llama-server.
LLAMACPP_VERSION='b9842'

# Embedding model to fetch. The basename must match the model_path of the
# local server entry in config.lua; change both together or they mismatch.
#
# Q8_0 (8-bit) is the quantization chosen for Pascal-class GPUs (GTX 1080 Ti,
# sm_61): consumer Pascal runs FP16 at roughly 1/64 of its FP32 rate, so an
# FP16 GGUF is either slow or gets upcast to FP32 and loses its size
# advantage. Q8_0 keeps weights at 8 bits while compute stays FP32, which
# Pascal handles at full rate; quality loss vs FP16 is negligible for
# embeddings. If VRAM gets tight, Q4_K_M is smaller and Q5_K_M is a middle
# ground.
MODEL_REPO='nomic-ai/nomic-embed-text-v1.5-GGUF'
MODEL_FILE='nomic-embed-text-v1.5.Q8_0.gguf'
printf -v MODEL_URL 'https://huggingface.co/%s/resolve/main/%s' \
    "$MODEL_REPO" "$MODEL_FILE"

# CUDA install constants. 12.9 is the last 12.x release that supports Pascal
# (sm_61) AND officially supports gcc up to 14.x; CUDA 13.0+ drops Pascal.
# libs/cuda must hold a toolkit whose version starts with
# REQUIRED_CUDA_PREFIX or it is reinstalled. The comparison is a plain
# string-prefix match: nvcc reporting "12.9.41" matches "12.9".
REQUIRED_CUDA_PREFIX='12.9'
CUDA_VERSION='12.9.0'
CUDA_DRIVER_MIN='575.51.03'
printf -v CUDA_INSTALLER_URL \
    'https://developer.download.nvidia.com/compute/cuda/%s/local_installers/cuda_%s_%s_linux.run' \
    "$CUDA_VERSION" "$CUDA_VERSION" "$CUDA_DRIVER_MIN"

# Highest host gcc major that CUDA 12.9's nvcc accepts without
# -allow-unsupported-compiler. Hardcoded on purpose (dynamic detection is
# not worth the complexity); update it when the toolkit version is bumped.
# A CUDA 12.6 entry used to live here and was removed together with the
# Ollama path -- see issue 10-049.
CUDA_12_9_MAX_GCC=14

# Parallelism for `cmake --build`. Defaults to 8 to keep the host CPU inside
# its thermal budget; export BUILD_JOBS=N before running to override
# (e.g. BUILD_JOBS=16 ./scripts/build-deps.sh).
: "${BUILD_JOBS:=8}"
102# }}}
103
104# {{{ Color codes for human-readable output
# ANSI colour escapes, defined with bash's $'...' ANSI-C quoting so the ESC
# byte is materialised at assignment time. That makes the values render
# correctly both through `echo -e` and inside `cat <<EOF` heredocs; a plain
# "\033[92m" string would be expanded by echo -e but printed verbatim by a
# heredoc (the bug previously visible in print_env_summary's banner).
# \e is the same byte as \033 under ANSI-C quoting.
C_GREEN=$'\e[92m'
C_BLUE=$'\e[94m'
C_RED=$'\e[91m'
C_YELLOW=$'\e[93m'
C_RESET=$'\e[0m'
115# }}}
116
117# {{{ parse_arguments
# Parse command-line arguments and derive every path the rest of the script
# uses.
# Globals written: CLEAN_BUILD, SKIP_MODEL, SKIP_CUDA, FORCE_CUDA (0/1 flags),
#   DIR (only when a positional directory is given), LOCAL_CUDA,
#   DOWNLOAD_DIR, CUDA_INSTALLER_PATH, LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR.
# Globals read: DIR (default), CUDA_VERSION, C_RED, C_RESET.
# Arguments: the script's "$@".
# Exits: 0 after printing --help; 1 on an unknown option, an empty directory
#   argument, or more than one directory argument.
parse_arguments() {
    CLEAN_BUILD=0
    SKIP_MODEL=0
    SKIP_CUDA=0
    FORCE_CUDA=0

    local arg
    local dir_given=0

    for arg in "$@"; do
        case "$arg" in
            --clean)
                CLEAN_BUILD=1
                ;;
            --no-model)
                SKIP_MODEL=1
                ;;
            --skip-cuda)
                # Trust whatever the operator already set up. Useful when
                # iterating on the llama.cpp build itself.
                SKIP_CUDA=1
                ;;
            --force-cuda)
                # Wipe libs/cuda before re-installing — useful when the
                # previous install is partial or the wrong version.
                FORCE_CUDA=1
                ;;
            --help|-h)
                # Print the script's own header comment (line 2 up to the
                # first blank line) with the leading "# " stripped.
                sed -n '2,/^$/p' "$0" | sed 's/^# \?//'
                exit 0
                ;;
            -*)
                echo -e "${C_RED}Unknown option: $arg${C_RESET}" >&2
                echo "Run with --help for usage." >&2
                exit 1
                ;;
            '')
                # An empty DIR would turn every derived path into an
                # absolute one under "/" (e.g. /libs/cuda), and several of
                # those paths are later passed to rm -rf. Typical cause: an
                # unset variable in the caller, e.g. build-deps.sh "$PROJ".
                echo -e "${C_RED}Project directory argument is empty.${C_RESET}" >&2
                echo "Run with --help for usage." >&2
                exit 1
                ;;
            *)
                # Exactly one positional argument is accepted: the project
                # directory. A second one is ambiguous, so refuse it rather
                # than silently letting the last one win.
                if [ "$dir_given" -eq 1 ]; then
                    echo -e "${C_RED}Unexpected extra argument: $arg (project directory already set to $DIR)${C_RESET}" >&2
                    echo "Run with --help for usage." >&2
                    exit 1
                fi
                DIR="$arg"
                dir_given=1
                ;;
        esac
    done

    # Derived paths — must come after DIR is finalized.
    LOCAL_CUDA="${DIR}/libs/cuda"
    DOWNLOAD_DIR="${DIR}/tmp/downloads"
    CUDA_INSTALLER_PATH="${DOWNLOAD_DIR}/cuda_${CUDA_VERSION}_linux.run"
    # llama.cpp paths: clone + build live in RAM-backed tmp/, the cmake
    # install step copies the finished bin/lib/include into the disk-backed
    # libs/llama.cpp. Wiping tmp/ between runs is safe — we'll re-clone.
    LLAMACPP_SRC_DIR="${DIR}/tmp/llamacpp-src"
    LLAMACPP_INSTALL_DIR="${DIR}/libs/llama.cpp"
}
168# }}}
169
170# {{{ get_nvcc_version
# Print the version an nvcc binary reports (e.g. "12.6.77").
# Arguments: $1 - path to an nvcc executable.
# Outputs: the first "V<digits and dots>" token from `nvcc --version`, minus
#   the leading V; an empty line when $1 is not an executable file.
# Callers use it to validate an existing install and to choose gcc-compat
# flags.
get_nvcc_version() {
    local nvcc_path="$1"
    if [ -x "$nvcc_path" ]; then
        "$nvcc_path" --version 2>/dev/null | grep -oP 'V\K[\d.]+' | head -1
    else
        echo ""
    fi
}
182# }}}
183
184# {{{ get_nvcc_major_minor
# Print only the "major.minor" part (e.g. "12.6", "12.9") of the version an
# nvcc binary reports. Point releases (.77, .41, ...) do not affect
# host-compiler support, so this is what selects the max-gcc constant.
# Arguments: $1 - path to an nvcc executable.
# Outputs: the first two dot-separated fields of get_nvcc_version's result,
#   or an empty line when that result is empty.
get_nvcc_major_minor() {
    local nvcc_path="$1"
    local reported
    reported=$(get_nvcc_version "$nvcc_path")
    if [ -n "$reported" ]; then
        cut -d. -f1-2 <<< "$reported"
    else
        echo ""
    fi
}
198# }}}
199
200# {{{ detect_cuda_arch
# Print the compute capability of the first GPU nvidia-smi lists, with the
# dot (and any spaces/newlines) removed -- e.g. "61" for the 1080 Ti's 6.1.
# build_llamacpp feeds this to CMAKE_CUDA_ARCHITECTURES so the build targets
# the GPU actually present rather than cmake's default arch list, which may
# omit the deprecated Pascal sm_61.
# Outputs: the arch digits, or an empty line when nvidia-smi is not on PATH.
detect_cuda_arch() {
    if command -v nvidia-smi >/dev/null 2>&1; then
        nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null \
            | head -1 \
            | tr -d '. \n'
    else
        echo ""
    fi
}
214# }}}
215
216# {{{ host_gcc_major
# Print the host gcc's major version (e.g. "14" for gcc 14.2.1).
# Outputs: everything before the first dot of `gcc -dumpfullversion`;
#   nothing at all when gcc produces no output (e.g. it is not installed).
host_gcc_major() {
    local full_version
    full_version=$(gcc -dumpfullversion 2>/dev/null)
    if [ -z "$full_version" ]; then
        return 0
    fi
    printf '%s\n' "${full_version%%.*}"
}
221# }}}
222
223# {{{ check_requirements
# Report which build prerequisites are present and abort when a hard
# requirement is missing.
# Hard requirements: git, cmake, make, curl, gcc, g++ and nvidia-smi (the
# kernel driver cannot be installed from a userspace script).
# NOT a hard requirement: nvcc. Its presence is only reported, because
# build_cuda installs a toolkit into libs/cuda/ when needed.
# Globals read: LOCAL_CUDA, CUDA_DRIVER_MIN, C_* colour codes.
# Exits: 1 (after printing the full report) when anything required is absent.
check_requirements() {
    echo -e "${C_BLUE}== Checking prerequisites ==${C_RESET}"

    local missing=0
    local tool tool_path

    for tool in git cmake make curl gcc g++; do
        if tool_path=$(command -v "$tool" 2>/dev/null); then
            echo -e " ${C_GREEN}found${C_RESET} $tool ($tool_path)"
        else
            echo -e " ${C_RED}MISSING${C_RESET} $tool"
            missing=1
        fi
    done

    # The driver is needed whichever way the toolkit gets installed.
    if command -v nvidia-smi >/dev/null 2>&1; then
        local driver_ver gpu_name compute_cap
        driver_ver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
        gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
        compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1)
        echo -e " ${C_GREEN}found${C_RESET} nvidia driver $driver_ver ($gpu_name, sm_${compute_cap//./})"
    else
        echo -e " ${C_RED}MISSING${C_RESET} nvidia-smi — NVIDIA driver not installed"
        echo -e " ${C_YELLOW}HINT${C_RESET} install the NVIDIA driver via your distro's mechanism"
        echo -e " ${C_YELLOW}HINT${C_RESET} CUDA toolkit needs driver >= ${CUDA_DRIVER_MIN}"
        missing=1
    fi

    local gcc_major
    gcc_major=$(host_gcc_major)
    echo -e " ${C_GREEN}found${C_RESET} gcc major version $gcc_major"

    # nvcc: informational only, never fatal.
    local project_nvcc="${LOCAL_CUDA}/bin/nvcc"
    local ver
    if [ -x "$project_nvcc" ]; then
        ver=$(get_nvcc_version "$project_nvcc")
        echo -e " ${C_GREEN}found${C_RESET} project nvcc $ver at libs/cuda/"
    elif command -v nvcc >/dev/null 2>&1; then
        ver=$(nvcc --version | grep -oP 'release \K[0-9.]+' | head -n1)
        echo -e " ${C_YELLOW}note${C_RESET} system nvcc $ver on PATH (will install fresh into libs/cuda/)"
    else
        echo -e " ${C_YELLOW}note${C_RESET} no nvcc found — build_cuda will install one"
    fi

    if [ "$missing" -ne 0 ]; then
        echo -e "${C_RED}== Prerequisites missing — aborting ==${C_RESET}" >&2
        exit 1
    fi
}
280# }}}
281
282# {{{ ensure_directories
# Create the directories later steps write into: libs/, assets/models/,
# tmp/ and the download directory.
# tmp/ is preferably set up by scripts/ensure-tmp-symlink (project
# convention: tmp/ is a symlink to a RAM-backed location for ephemeral
# files); if that helper is absent or fails, a plain tmp/ directory is
# created instead. The helper's stderr is discarded.
# Globals read: DIR, DOWNLOAD_DIR.
ensure_directories() {
    mkdir -p "${DIR}/libs" "${DIR}/assets/models"
    if ! "${DIR}/scripts/ensure-tmp-symlink" "${DIR}" 2>/dev/null; then
        mkdir -p "${DIR}/tmp"
    fi
    mkdir -p "$DOWNLOAD_DIR"
}
292# }}}
293
294# {{{ already_have_libs_cuda
# Succeed when libs/cuda already holds a toolkit whose nvcc version starts
# with REQUIRED_CUDA_PREFIX (e.g. "12.9"). The script supports exactly one
# toolkit version, so any other version is reported as needing a reinstall;
# what to do with the answer is the caller's decision.
# Returns: 0 on a prefix match; 1 when no working nvcc is found or the
#   version does not match.
already_have_libs_cuda() {
    local installed
    installed=$(get_nvcc_version "${LOCAL_CUDA}/bin/nvcc")
    [ -n "$installed" ] || return 1

    if [[ "$installed" == ${REQUIRED_CUDA_PREFIX}* ]]; then
        echo -e " ${C_GREEN}existing${C_RESET} libs/cuda has CUDA $installed (matches required ${REQUIRED_CUDA_PREFIX}.x)"
        return 0
    fi

    # install_cuda_runfile empties libs/cuda before installing, so the old
    # and new versions never end up mixed in one prefix.
    echo -e " ${C_YELLOW}upgrade${C_RESET} libs/cuda has CUDA $installed but ${REQUIRED_CUDA_PREFIX}.x is required — reinstalling"
    return 1
}
319# }}}
320
321# {{{ force_clean_libs_cuda
# Implements --force-cuda: delete libs/cuda so the next install starts from
# nothing (for a corrupt or wrong-version toolkit). Does nothing unless
# FORCE_CUDA is 1 and the directory exists.
force_clean_libs_cuda() {
    [ "$FORCE_CUDA" -eq 1 ] || return 0
    [ -d "$LOCAL_CUDA" ] || return 0

    echo -e "${C_YELLOW}== --force-cuda: removing existing libs/cuda ==${C_RESET}"
    rm -rf "$LOCAL_CUDA"
}
330# }}}
331
332# {{{ download_cuda_installer
# Fetch the CUDA runfile installer into CUDA_INSTALLER_PATH unless a copy is
# already there.
#
# The download is written to "<path>.part" and renamed only after curl
# succeeds. The cache check is a bare existence test, so writing straight to
# the final name would let an interrupted transfer (Ctrl-C, killed shell,
# power loss) leave a truncated file that every later run accepts as a
# valid cached installer.
#
# Globals read: CUDA_INSTALLER_PATH, CUDA_INSTALLER_URL, CUDA_VERSION, C_*.
# Exits: 1 when the download or the final rename fails (partial file removed).
download_cuda_installer() {
    if [ -f "$CUDA_INSTALLER_PATH" ]; then
        local size_mb
        size_mb=$(du -m "$CUDA_INSTALLER_PATH" | cut -f1)
        echo -e " ${C_GREEN}cached${C_RESET} installer already at $CUDA_INSTALLER_PATH (${size_mb} MB)"
        return
    fi

    local partial="${CUDA_INSTALLER_PATH}.part"

    echo -e "${C_BLUE}== Downloading CUDA $CUDA_VERSION installer (~5 GB) ==${C_RESET}"
    echo -e " url: $CUDA_INSTALLER_URL"
    if ! curl -L --fail --progress-bar -o "$partial" "$CUDA_INSTALLER_URL"; then
        echo -e "${C_RED}CUDA installer download failed.${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi

    # Same-directory rename: the final name only ever refers to a complete
    # file.
    if ! mv -f "$partial" "$CUDA_INSTALLER_PATH"; then
        echo -e "${C_RED}Could not move downloaded installer into place at $CUDA_INSTALLER_PATH${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi
}
348# }}}
349
350# {{{ install_cuda_runfile
# Run the downloaded .run installer in --silent --toolkit mode with
# --toolkitpath set to the project-local libs/cuda/. The target is
# user-writable, so no sudo is needed and nothing lands in /usr/local.
#
# Logging: the CUDA 12.9 installer writes its own log to a hardcoded
# /var/log/cuda-installer.log (which fails silently for non-root) and has no
# flag to move it, so the installer's stdout+stderr are redirected to
# tmp/cuda-installer-output.log instead. That captures most failure modes
# (bad flags, missing components, permission problems on the prefix).
#
# Globals read: LOCAL_CUDA, DIR, CUDA_INSTALLER_PATH, CUDA_VERSION, C_*.
# Exits: 1 when the installer fails or leaves no executable bin/nvcc.
# On success, finishes by calling patch_cuda_headers.
install_cuda_runfile() {
    local log_file="${DIR}/tmp/cuda-installer-output.log"
    local installer_args=(
        --silent
        --toolkit
        "--toolkitpath=$LOCAL_CUDA"
        --no-opengl-libs
        --no-man-page
        "--tmpdir=${DIR}/tmp"
    )

    # Start from an empty prefix: files left over from a different toolkit
    # version would otherwise mix with the new install.
    rm -rf "$LOCAL_CUDA"
    mkdir -p "$LOCAL_CUDA"

    echo -e "${C_BLUE}== Installing CUDA $CUDA_VERSION toolkit into libs/cuda (no sudo) ==${C_RESET}"
    echo -e " prefix: $LOCAL_CUDA"
    echo -e " log: $log_file (installer stdout+stderr; --tmpdir keeps work in tmp/)"

    if ! sh "$CUDA_INSTALLER_PATH" "${installer_args[@]}" > "$log_file" 2>&1; then
        echo -e "${C_RED}CUDA installer failed.${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} check $log_file for the installer's output" >&2
        exit 1
    fi

    if [ -x "${LOCAL_CUDA}/bin/nvcc" ]; then
        echo -e "${C_GREEN}CUDA $CUDA_VERSION installed at $LOCAL_CUDA${C_RESET}"
    else
        echo -e "${C_RED}Installer completed but nvcc not at ${LOCAL_CUDA}/bin/nvcc${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} check $log_file for what went wrong" >&2
        exit 1
    fi

    patch_cuda_headers
}
391# }}}
392
393# {{{ patch_cuda_headers
# Work around a header clash between CUDA 12.9.0 and glibc 2.40+.
# CUDA 12.9.0's crt/math_functions.h declares sinpi/sinpif/cospi/cospif
# without noexcept(true); glibc 2.40+ declares them with it (via
# __MATHCALL_VEC). nvcc rejects the mismatching exception specification,
# which breaks every CUDA compile on such hosts. CUDA 12.9.1+ already ships
# the noexcept, so this applies the same change to the 12.9.0 header in
# place, sparing the operator a re-download.
#
# Idempotent: the substitution skips any line that already contains
# "noexcept", so a second run adds nothing.
# Globals read: LOCAL_CUDA, C_*.
# Returns without changes (after a WARN) when the header is not found.
patch_cuda_headers() {
    local header="${LOCAL_CUDA}/targets/x86_64-linux/include/crt/math_functions.h"
    local sed_script='/noexcept/!{
/^extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ .* (sinpi|cospi|sinpif|cospif)\(.*\);$/s/;$/ noexcept(true);/
}'

    if [ -f "$header" ]; then
        sed -i -E "$sed_script" "$header"
        echo -e " ${C_GREEN}patched${C_RESET} math_functions.h: added noexcept(true) to sinpi/cospi/sinpif/cospif"
    else
        echo -e " ${C_YELLOW}WARN${C_RESET} math_functions.h not at expected path — skipping noexcept patch"
    fi
}
416# }}}
417
418# {{{ build_cuda
# Entry point for the CUDA step. Three outcomes:
#   - SKIP_CUDA=1 (--skip-cuda): trust the operator's setup, do nothing.
#   - libs/cuda already matches the required version prefix: do nothing.
#   - anything else: download the installer and install straight into
#     libs/cuda/ (no sudo).
# --force-cuda is honoured first, via force_clean_libs_cuda.
# Exits: 1 if libs/cuda/bin/nvcc is still not executable after installing.
build_cuda() {
    if [ "$SKIP_CUDA" -eq 1 ]; then
        echo -e "${C_YELLOW}== --skip-cuda set — trusting existing CUDA setup ==${C_RESET}"
        return
    fi

    force_clean_libs_cuda
    already_have_libs_cuda && return

    download_cuda_installer
    install_cuda_runfile

    # Last line of defence: the install must have produced a usable nvcc.
    [ -x "${LOCAL_CUDA}/bin/nvcc" ] || {
        echo -e "${C_RED}CUDA install completed but libs/cuda/bin/nvcc is missing${C_RESET}" >&2
        exit 1
    }
}
445# }}}
446
447# {{{ detect_old_llamacpp_layout
# Guard against the pre-migration layout. Older versions of this script kept
# the llama.cpp git checkout at libs/llama.cpp/; now the source lives in
# tmp/llamacpp-src/ and libs/llama.cpp/ is only an install prefix.
# When a .git directory is found there and --clean was not given, stop here
# -- main calls this before the CUDA step, so the operator does not sit
# through a 5 GB download first. With --clean the run continues, because
# clone_llamacpp's clean handling removes libs/llama.cpp.
# Exits: 1 on old layout without --clean. Returns 0 otherwise.
detect_old_llamacpp_layout() {
    if [ -d "${LLAMACPP_INSTALL_DIR}/.git" ] && ! [ "$CLEAN_BUILD" -eq 1 ]; then
        {
            echo -e "${C_YELLOW}== Detected old layout: libs/llama.cpp/ contains a git checkout ==${C_RESET}"
            echo -e " The new layout uses libs/llama.cpp/ as an install prefix (bin/, lib/,"
            echo -e " include/), with the source tree living in tmp/llamacpp-src/. Either"
            echo -e " re-run with --clean to wipe the old checkout, or delete it manually:"
            echo -e " rm -rf ${LLAMACPP_INSTALL_DIR}"
        } >&2
        exit 1
    fi
    return 0
}
469# }}}
470
471# {{{ clone_llamacpp
# Make sure tmp/llamacpp-src holds llama.cpp checked out at the pinned tag
# LLAMACPP_VERSION. The checkout lives in the RAM-backed tmp/ tree; only
# finished artifacts go under libs/. Pinning to a tag keeps builds
# reproducible, and -c advice.detachedHead=false silences git's cosmetic
# detached-HEAD notice for tag checkouts.
#
# Behaviour:
#   - CLEAN_BUILD=1: remove both the source tree and the libs/llama.cpp
#     install first.
#   - existing checkout: fetch ONLY the pinned tag (depth 1) and check it
#     out. The previous `fetch --tags` asked the remote for every tag it has
#     on each re-run; an explicit refs/tags/<tag> refspec plus --no-tags
#     limits the transfer to the one tag that is needed and still creates
#     the local tag ref that `checkout` resolves.
#   - otherwise: shallow-clone the tag into a clean target.
#
# Globals read: CLEAN_BUILD, LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR,
#   LLAMACPP_VERSION (must name a tag), C_*.
# Exits: 1 when fetch, checkout or clone fails.
clone_llamacpp() {
    if [ "$CLEAN_BUILD" -eq 1 ]; then
        if [ -d "$LLAMACPP_SRC_DIR" ]; then
            echo -e "${C_BLUE}== --clean: removing tmp/llamacpp-src ==${C_RESET}"
            rm -rf "$LLAMACPP_SRC_DIR"
        fi
        if [ -d "$LLAMACPP_INSTALL_DIR" ]; then
            echo -e "${C_BLUE}== --clean: removing libs/llama.cpp install ==${C_RESET}"
            rm -rf "$LLAMACPP_INSTALL_DIR"
        fi
    fi

    if [ -d "${LLAMACPP_SRC_DIR}/.git" ]; then
        local tag_ref="refs/tags/${LLAMACPP_VERSION}"

        echo -e "${C_BLUE}== Updating llama.cpp checkout in tmp/ ==${C_RESET}"
        # Leading "+" lets the local tag be updated if it already exists.
        git -C "$LLAMACPP_SRC_DIR" fetch --depth=1 --no-tags origin "+${tag_ref}:${tag_ref}" || {
            echo -e "${C_RED}Failed to fetch llama.cpp tag $LLAMACPP_VERSION${C_RESET}" >&2
            exit 1
        }
        git -c advice.detachedHead=false -C "$LLAMACPP_SRC_DIR" checkout "$LLAMACPP_VERSION" || {
            echo -e "${C_RED}Failed to checkout llama.cpp $LLAMACPP_VERSION${C_RESET}" >&2
            exit 1
        }
    else
        # If something exists at LLAMACPP_SRC_DIR but it isn't a git checkout
        # (interrupted clone, leftover dir), clear it so the clone has a
        # clean target. tmp/ is RAM-backed so wiping is cheap.
        if [ -e "$LLAMACPP_SRC_DIR" ]; then
            rm -rf "$LLAMACPP_SRC_DIR"
        fi
        mkdir -p "$(dirname "$LLAMACPP_SRC_DIR")"
        echo -e "${C_BLUE}== Cloning llama.cpp (tag $LLAMACPP_VERSION) into tmp/ ==${C_RESET}"
        git -c advice.detachedHead=false clone --depth=1 --branch "$LLAMACPP_VERSION" \
            https://github.com/ggml-org/llama.cpp.git "$LLAMACPP_SRC_DIR" || {
            echo -e "${C_RED}Clone failed. Check network and tag validity.${C_RESET}" >&2
            exit 1
        }
    fi

    echo -e "${C_GREEN}llama.cpp source ready at $LLAMACPP_SRC_DIR${C_RESET}"
}
517# }}}
518
519# {{{ build_llamacpp
# Configure and build llama.cpp with CUDA support. The build tree is
# tmp/llamacpp-src/build/ (RAM-backed); only install_llamacpp copies
# finished products into libs/.
#
# cmake is driven with explicit -S/-B (and --build <dir>) instead of
# `cd build && cmake ..`, so the function neither depends on nor changes the
# script's working directory, and a failed directory change can no longer
# make cmake run in the wrong place.
#
# GPU-specific decisions made here:
#   1. When libs/cuda/bin/nvcc exists, CUDAToolkit_ROOT is pointed at
#      libs/cuda/ and libs/cuda's bin/ and lib64/ are PREPENDED to PATH and
#      LD_LIBRARY_PATH (exported, so they stay in effect for the rest of the
#      script). Without a project-local nvcc, cmake auto-detects CUDA.
#   2. CMAKE_CUDA_ARCHITECTURES is set to the detected GPU's compute
#      capability (sm_61 for the 1080 Ti) unless the operator already set
#      that variable in the environment. This speeds up the build and keeps
#      Pascal in the compiled arch list.
#   3. If the project nvcc is 12.9 and the host gcc major is above
#      CUDA_12_9_MAX_GCC, -allow-unsupported-compiler is added so nvcc skips
#      its host-compiler gate. For any other (or undetectable) nvcc version,
#      e.g. under --skip-cuda, nothing is added and nothing is reported.
#
# Globals read: LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR, LOCAL_CUDA,
#   CLEAN_BUILD, BUILD_JOBS, CUDA_12_9_MAX_GCC, LLAMACPP_VERSION,
#   CMAKE_CUDA_ARCHITECTURES (optional env override), C_*.
# Exits: 1 when the build dir cannot be created, configure or build fails,
#   or any of llama-server / llama-cli / llama-embedding is missing
#   afterwards.
build_llamacpp() {
    local build_dir="${LLAMACPP_SRC_DIR}/build"

    # CMAKE_INSTALL_PREFIX is baked in at configure time so a later
    # `cmake --install` lays artifacts under libs/llama.cpp/. Without
    # --clean the build tree is kept, giving incremental rebuilds.
    if [ "$CLEAN_BUILD" -eq 1 ] && [ -d "$build_dir" ]; then
        rm -rf "$build_dir"
    fi

    mkdir -p "$build_dir" || {
        echo -e "${C_RED}Could not create build directory $build_dir${C_RESET}" >&2
        exit 1
    }

    # Binary location moved across the b4404 -> b9842 bump (Issue 10-031):
    # llama-server / llama-cli / llama-embedding used to be built from
    # examples/ (LLAMA_BUILD_EXAMPLES=ON); upstream relocated them to tools/,
    # gated by LLAMA_BUILD_TOOLS. BOTH are ON so the build descends into
    # whichever tree the pinned tag uses. LLAMA_BUILD_SERVER is kept explicit
    # to document intent.
    #
    # CMAKE_INSTALL_LIBDIR=lib: GNUInstallDirs picks lib64/ on some distros
    # (Void/RHEL/SUSE) and lib/ on others (Debian/Arch); forcing lib/ keeps
    # the hardcoded RPATH "$ORIGIN/../lib" valid everywhere.
    #
    # CMAKE_INSTALL_RPATH + CMAKE_BUILD_WITH_INSTALL_RPATH: installed
    # binaries find their neighbouring .so files without LD_LIBRARY_PATH.
    local cmake_flags=(
        -DGGML_CUDA=ON
        -DGGML_NATIVE=ON
        -DLLAMA_BUILD_TESTS=OFF
        -DLLAMA_BUILD_EXAMPLES=ON
        -DLLAMA_BUILD_TOOLS=ON
        -DLLAMA_BUILD_SERVER=ON
        -DLLAMA_CURL=OFF
        -DCMAKE_INSTALL_PREFIX="${LLAMACPP_INSTALL_DIR}"
        -DCMAKE_INSTALL_LIBDIR=lib
        -DCMAKE_INSTALL_RPATH='$ORIGIN/../lib'
        -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
    )

    # Point cmake at libs/cuda directly. This is the single source of
    # truth for "which CUDA does this build use".
    if [ -x "${LOCAL_CUDA}/bin/nvcc" ]; then
        echo -e "${C_BLUE}== Using project-local CUDA at $LOCAL_CUDA ==${C_RESET}"
        cmake_flags+=("-DCUDAToolkit_ROOT=${LOCAL_CUDA}")
        export PATH="${LOCAL_CUDA}/bin:${PATH}"
        export LD_LIBRARY_PATH="${LOCAL_CUDA}/lib64:${LD_LIBRARY_PATH:-}"
    else
        echo -e "${C_YELLOW}== No libs/cuda — cmake will auto-detect (system PATH, /usr/local/cuda*) ==${C_RESET}"
    fi

    # Target the detected GPU's compute capability explicitly. An operator-
    # supplied CMAKE_CUDA_ARCHITECTURES environment variable wins (useful
    # when building for a different GPU).
    if [ -z "${CMAKE_CUDA_ARCHITECTURES:-}" ]; then
        local arch
        arch=$(detect_cuda_arch)
        if [ -n "$arch" ]; then
            echo -e " ${C_GREEN}arch${C_RESET} targeting sm_$arch (detected from nvidia-smi)"
            cmake_flags+=("-DCMAKE_CUDA_ARCHITECTURES=${arch}")
        else
            echo -e " ${C_YELLOW}arch${C_RESET} could not detect GPU — letting cmake pick defaults"
        fi
    else
        echo -e " ${C_BLUE}arch${C_RESET} using CMAKE_CUDA_ARCHITECTURES=$CMAKE_CUDA_ARCHITECTURES from env"
        cmake_flags+=("-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
    fi

    # Host-compiler gate (decision 3 above).
    local cuda_mm gcc_major
    cuda_mm=$(get_nvcc_major_minor "${LOCAL_CUDA}/bin/nvcc")
    gcc_major=$(host_gcc_major)
    if [ "$cuda_mm" = "12.9" ] && [ -n "$gcc_major" ]; then
        if [ "$gcc_major" -gt "$CUDA_12_9_MAX_GCC" ]; then
            echo -e " ${C_YELLOW}gcc${C_RESET} host gcc $gcc_major > CUDA $cuda_mm max ($CUDA_12_9_MAX_GCC); adding -allow-unsupported-compiler"
            cmake_flags+=("-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler")
        else
            echo -e " ${C_GREEN}gcc${C_RESET} host gcc $gcc_major within CUDA $cuda_mm supported range (<= $CUDA_12_9_MAX_GCC)"
        fi
    fi

    echo -e "${C_BLUE}== Configuring llama.cpp ==${C_RESET}"
    cmake -S "$LLAMACPP_SRC_DIR" -B "$build_dir" "${cmake_flags[@]}" || {
        echo -e "${C_RED}CMake configure failed.${C_RESET}" >&2
        echo -e "${C_YELLOW}Likely causes (given the script's current path):${C_RESET}" >&2
        echo -e " - libs/cuda/ is incomplete (an interrupted install) — try --force-cuda" >&2
        echo -e " - CMAKE_CUDA_ARCHITECTURES env override targets an arch this CUDA does not support" >&2
        echo -e " - llama.cpp upstream renamed a CMake variable since tag $LLAMACPP_VERSION (we pin to that tag)" >&2
        echo -e " - host gcc bumped past CUDA's max — update the CUDA_12_9_MAX_GCC constant in this script" >&2
        exit 1
    }

    echo -e "${C_BLUE}== Building llama.cpp ==${C_RESET}"
    echo -e " ${C_GREEN}jobs${C_RESET} using $BUILD_JOBS parallel build jobs (override with BUILD_JOBS=N)"
    cmake --build "$build_dir" --config Release -j "$BUILD_JOBS" || {
        echo -e "${C_RED}Build failed.${C_RESET}" >&2
        exit 1
    }

    # Verify the binaries we care about actually got built (still in the
    # tmp/ build tree at this point — install_llamacpp copies them out).
    local missing=0
    local bin
    for bin in llama-server llama-cli llama-embedding; do
        if [ ! -x "$build_dir/bin/$bin" ]; then
            echo -e " ${C_RED}MISSING${C_RESET} $bin"
            missing=1
        else
            echo -e " ${C_GREEN}built${C_RESET} $bin"
        fi
    done

    if [ "$missing" -ne 0 ]; then
        # The previous hint claimed the binaries live under examples/ at the
        # pinned tag; that stopped being true with the bump to b9842.
        echo -e "${C_RED}Some binaries are missing. The CMake build flags may have skipped them.${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} since the b4404 -> b9842 bump, llama-server / llama-cli / llama-embedding" >&2
        echo -e " ${C_YELLOW} ${C_RESET} live under tools/ (LLAMA_BUILD_TOOLS=ON); older tags built them from" >&2
        echo -e " ${C_YELLOW} ${C_RESET} examples/ (LLAMA_BUILD_EXAMPLES=ON). Both are ON here, so check whether" >&2
        echo -e " ${C_YELLOW} ${C_RESET} upstream renamed or re-gated these targets at tag $LLAMACPP_VERSION." >&2
        exit 1
    fi

    echo -e "${C_GREEN}llama.cpp built successfully in tmp/${C_RESET}"
}
664# }}}
665
666# {{{ install_llamacpp
# Run `cmake --install` on the tmp/ build tree so binaries, shared libraries
# and headers land in the disk-backed libs/llama.cpp/ prefix. This is the
# point where artifacts move "from RAM to disk": afterwards the tmp/ source
# and build trees can be wiped without affecting the ability to run
# llama-server.
# Afterwards the three expected binaries are checked at their final
# location, which catches upstream install rules changing shape.
# Globals read: LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR, C_*.
# Exits: 1 when cmake --install fails or an expected binary is missing.
install_llamacpp() {
    local build_dir="${LLAMACPP_SRC_DIR}/build"
    local bin_dir="${LLAMACPP_INSTALL_DIR}/bin"

    echo -e "${C_BLUE}== Installing llama.cpp artifacts to $LLAMACPP_INSTALL_DIR ==${C_RESET}"
    if ! cmake --install "$build_dir" --config Release; then
        echo -e "${C_RED}cmake --install failed.${C_RESET}" >&2
        exit 1
    fi

    local missing=0
    local bin
    for bin in llama-server llama-cli llama-embedding; do
        if [ -x "${bin_dir}/$bin" ]; then
            echo -e " ${C_GREEN}installed${C_RESET} bin/$bin"
        else
            echo -e " ${C_RED}MISSING${C_RESET} ${bin_dir}/$bin"
            missing=1
        fi
    done

    [ "$missing" -eq 0 ] || {
        echo -e "${C_RED}cmake --install completed but expected binaries are missing.${C_RESET}" >&2
        exit 1
    }

    echo -e "${C_GREEN}llama.cpp installed at $LLAMACPP_INSTALL_DIR${C_RESET}"
}
701# }}}
702
703# {{{ download_model
# Fetch the configured GGUF model (MODEL_URL) into assets/models/ unless a
# plausible copy is already there. Skipped entirely with --no-model, which
# is handy when iterating on the build itself.
#
# NOTE(review): this comment used to quote "around 280 MB at f16 precision",
# but the pinned MODEL_FILE is the Q8_0 quantization, so the real download
# is presumably smaller — confirm against the model repository before
# relying on a number.
#
# An existing file of 100,000,000 bytes or less is treated as a broken
# download and replaced. New downloads are written to "<file>.part" and
# renamed only after curl succeeds, so an interrupted transfer can never
# leave a truncated file under the final name (one larger than the size
# threshold would otherwise be accepted as valid on the next run).
#
# Globals read: SKIP_MODEL, DIR, MODEL_FILE, MODEL_URL, C_*.
# Exits: 1 when the download or the final rename fails (partial file removed).
download_model() {
    if [ "$SKIP_MODEL" -eq 1 ]; then
        echo -e "${C_YELLOW}== Skipping model download (--no-model) ==${C_RESET}"
        return
    fi

    local model_path="${DIR}/assets/models/${MODEL_FILE}"
    local partial="${model_path}.part"

    if [ -f "$model_path" ]; then
        local size
        size=$(stat -c '%s' "$model_path")
        if [ "$size" -gt 100000000 ]; then
            echo -e "${C_GREEN}Model already present: $model_path ($((size / 1024 / 1024)) MB)${C_RESET}"
            return
        fi
        echo -e "${C_YELLOW}Existing model file is suspiciously small ($size bytes); re-downloading.${C_RESET}"
        rm -f "$model_path"
    fi

    echo -e "${C_BLUE}== Downloading $MODEL_FILE from HuggingFace ==${C_RESET}"
    echo -e " URL: $MODEL_URL"
    if ! curl -L --fail --progress-bar -o "$partial" "$MODEL_URL"; then
        echo -e "${C_RED}Download failed.${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi

    if ! mv -f "$partial" "$model_path"; then
        echo -e "${C_RED}Could not move downloaded model into place at $model_path${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi

    echo -e "${C_GREEN}Model saved to $model_path${C_RESET}"
}
737# }}}
738
739# {{{ smoke_test
# Start the installed llama-server briefly, request one embedding, and stop
# it again. Catches "the build linked but does not actually run" before the
# operator wires the pipeline against it. Skipped when SKIP_MODEL=1 (there
# is no model to load).
#
# Readiness: /health is polled with `curl --fail`, so only a successful HTTP
# status counts as ready. Without --fail, curl exits 0 on an HTTP error
# status as well; if the server answers /health with an error status while
# the model is still loading, the embedding request below would then race
# the load. The poll also stops as soon as the server process has exited,
# rather than waiting out the full 30 tries against a dead server.
#
# Globals read: SKIP_MODEL, LLAMACPP_INSTALL_DIR, DIR, MODEL_FILE,
#   LOCAL_CUDA, LD_LIBRARY_PATH, C_*.
# Side effects: writes tmp/llamacpp-smoketest.log; binds 127.0.0.1:18080
#   while the test runs.
# Exits: 1 when the server never becomes ready, dies early, or the response
#   contains no "embedding" field.
smoke_test() {
    if [ "$SKIP_MODEL" -eq 1 ]; then
        echo -e "${C_YELLOW}== Skipping smoke test (model not downloaded) ==${C_RESET}"
        return
    fi

    local server_bin="${LLAMACPP_INSTALL_DIR}/bin/llama-server"
    local model_path="${DIR}/assets/models/${MODEL_FILE}"
    local port=18080 # Non-default to avoid collision with any running server
    local log="${DIR}/tmp/llamacpp-smoketest.log"

    # The binary's RPATH ($ORIGIN/../lib) covers llama.cpp's own libraries
    # (libllama.so, libggml*.so). The CUDA runtime under libs/cuda is not on
    # that RPATH, so it is supplied through LD_LIBRARY_PATH for this one
    # process.
    local smoke_ld_path="${LOCAL_CUDA}/lib64:${LD_LIBRARY_PATH:-}"

    echo -e "${C_BLUE}== Smoke testing llama-server ==${C_RESET}"
    LD_LIBRARY_PATH="$smoke_ld_path" "$server_bin" \
        -m "$model_path" \
        --embedding \
        --host 127.0.0.1 \
        --port "$port" \
        > "$log" 2>&1 &
    local pid=$!

    # Poll up to 30 times, one second apart, for a successful /health.
    local i=0
    local ready=0
    local server_died=0
    while [ "$i" -lt 30 ]; do
        if curl -s --fail --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then
            ready=1
            break
        fi
        # kill -0 only probes for existence; a vanished process means the
        # server crashed (bad model, missing library, port in use, ...).
        if ! kill -0 "$pid" 2>/dev/null; then
            server_died=1
            break
        fi
        sleep 1
        i=$((i + 1))
    done

    if [ "$ready" -ne 1 ]; then
        if [ "$server_died" -eq 1 ]; then
            echo -e "${C_RED}Server exited before becoming responsive${C_RESET}" >&2
        else
            echo -e "${C_RED}Server did not become responsive within 30 s${C_RESET}" >&2
        fi
        echo "Last 20 lines of server log ($log):" >&2
        tail -n 20 "$log" >&2
        kill "$pid" 2>/dev/null
        wait "$pid" 2>/dev/null
        exit 1
    fi

    local response
    response=$(curl -s --max-time 10 "http://127.0.0.1:$port/v1/embeddings" \
        -H 'Content-Type: application/json' \
        -d '{"model": "nomic-embed-text", "input": "clustering: hello world"}')

    kill "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null

    if echo "$response" | grep -q '"embedding"'; then
        echo -e "${C_GREEN}Smoke test passed — server returned a valid embedding.${C_RESET}"
    else
        echo -e "${C_RED}Smoke test failed — unexpected response:${C_RESET}" >&2
        echo "$response" >&2
        exit 1
    fi
}
805# }}}
806
807# {{{ print_env_summary
# Print the closing banner: where CUDA, the binaries, libraries, source tree
# and model ended up, plus the two export lines an operator needs in order
# to use libs/cuda's nvcc / CUDA runtime from an interactive shell. The
# script sets up its own environment internally; this hint is for tools run
# outside it.
# Globals read: LOCAL_CUDA, LLAMACPP_INSTALL_DIR, LLAMACPP_SRC_DIR, DIR,
#   MODEL_FILE, C_GREEN, C_RESET.
print_env_summary() {
    local rule="${C_GREEN}===============================================================${C_RESET}"
    local cuda_bin_dir="${LOCAL_CUDA}/bin"
    local cuda_lib_dir="${LOCAL_CUDA}/lib64"

    # \$PATH and \$LD_LIBRARY_PATH are escaped so they are printed literally
    # for the operator to paste, not expanded here.
    cat <<EOF

${rule}
${C_GREEN} build-deps.sh complete${C_RESET}
${rule}
 CUDA: ${LOCAL_CUDA}
 Binaries: ${LLAMACPP_INSTALL_DIR}/bin/
 Libs: ${LLAMACPP_INSTALL_DIR}/lib/
 Source: ${LLAMACPP_SRC_DIR} (RAM-backed; wipes on reboot)
 Model: ${DIR}/assets/models/${MODEL_FILE}

 To use libs/cuda's nvcc/cuda-runtime from your shell, add to ~/.bashrc:
 export PATH="${cuda_bin_dir}:\$PATH"
 export LD_LIBRARY_PATH="${cuda_lib_dir}:\$LD_LIBRARY_PATH"
EOF
}
828# }}}
829
830# {{{ main
# Script entry point: parse the command line, then run every build step in
# order. Each step terminates the script itself on a fatal error, so no
# status is checked between steps.
# detect_old_llamacpp_layout deliberately runs before build_cuda: if
# libs/llama.cpp/.git exists and --clean was not passed, the run stops
# before the operator pays for a 5 GB CUDA download.
main() {
    parse_arguments "$@"

    local step
    for step in \
        check_requirements \
        detect_old_llamacpp_layout \
        ensure_directories \
        build_cuda \
        clone_llamacpp \
        build_llamacpp \
        install_llamacpp \
        download_model \
        smoke_test \
        print_env_summary
    do
        "$step"
    done
}
847# }}}
848
849main "$@"
850
851# vim: set foldmethod=marker:
852