scripts/build-deps.sh

852 lines

1#!/bin/bash

2# scripts/build-deps.sh

3# Downloads CUDA and llama.cpp source into the RAM-backed tmp/ tree, builds

4# llama.cpp there, then installs the finished binaries + shared libraries

5# into libs/ so the project owns a clean, disk-backed copy. Replaces the

6# system-installed Ollama daemon as the embedding backend per issue 10-049.

8# CUDA install: download CUDA 12.9 from NVIDIA (~5 GB) into tmp/downloads/

9# and install via the runfile installer DIRECTLY into libs/cuda/ using

10# --toolkitpath. No sudo required — nothing touches /usr/local or /var/log.

11# CUDA 12.9 was chosen because it is the most recent toolkit that still

12# supports Pascal (sm_61, the 1080 Ti) AND officially supports gcc up to

13# 14.x — so no -allow-unsupported-compiler workaround is needed on

14# rolling-distro hosts running gcc 14.

15#

16# CUDA 13.0+ would solve gcc compatibility officially but dropped Pascal

17# entirely, so this project is pinned to the 12.x line for as long as it

18# cares about the 1080 Ti. Ollama's bundled CUDA was tried as a download-

19# free shortcut, but its bundled libs are built without Pascal in the arch

20# list, so the binaries do not actually run on the 1080 Ti even though the

21# toolkit metadata says they should. Removed in favor of the one path that

22# is known to work end-to-end.

23#

24# llama.cpp source/build live in tmp/llamacpp-src/ (RAM-backed), and the

25# install step copies only the finished bin/, lib/, and include/ into

26# libs/llama.cpp/ via "cmake --install --prefix". This way disk holds the

27# ~100 MB of artifacts that need to persist, and RAM absorbs the 1–3 GB of

28# build churn.

29#

30# Usage:

31# ./scripts/build-deps.sh # Build into the default project DIR

32# ./scripts/build-deps.sh /custom/dir # Build into a different project DIR

33# ./scripts/build-deps.sh --clean # Wipe tmp source AND libs install

34# ./scripts/build-deps.sh --no-model # Skip the GGUF model download

35# ./scripts/build-deps.sh --skip-cuda # Trust whatever CUDA is already present

36# ./scripts/build-deps.sh --force-cuda # Wipe libs/cuda/ before installing

37# ./scripts/build-deps.sh --help # Show this message

38#

39# Environment:

40# BUILD_JOBS=N # Parallel cmake build jobs (default: 8)

41# # Lower this if the host CPU is overheating.

42#

43# What this gives you on success:

44# $DIR/libs/cuda/ — the CUDA toolkit (nvcc, libcudart, etc.)

45# $DIR/libs/llama.cpp/bin/ — llama-server, llama-cli, llama-embedding

46# $DIR/libs/llama.cpp/lib/ — libllama.so, libggml*.so

47# $DIR/tmp/llamacpp-src/ — RAM-backed clone + build (ephemeral)

48# $DIR/assets/models/<model>.gguf — the embedding model file

49# A smoke-tested working install ready for the 10-049 migration.

# {{{ Hard-coded project directory
# Default project root. parse_arguments overwrites it when a positional
# directory argument is given on the command line.
DIR='/mnt/mtwo/programming/ai-stuff/neocities-modernization'
# }}}

# {{{ Pinned versions
# llama.cpp is pinned to a known-good tag rather than tracking master, so a
# future upstream change does not silently break the build. Bump this field
# only after testing.
#
# Bumped b4404 -> b9842 (Issue 10-031): b4404 predates the "gemma-embedding"
# model architecture, so EmbeddingGemma GGUFs fail to load with "unknown model
# architecture". b9842 knows it. NOTE: between those tags upstream moved the
# server/cli/embedding binaries from examples/ to tools/, so the build now also
# needs -DLLAMA_BUILD_TOOLS=ON (see build_llamacpp) or it produces the shared
# libs with no llama-server.
LLAMACPP_VERSION="b9842"

# Model to download. The basename matches what config.lua's local server
# entry's model_path expects; updating one without the other would mismatch.
# Q8_0 (8-bit) is the chosen quantization for Pascal-class GPUs (GTX 1080 Ti,
# sm_61): NVIDIA gutted FP16 throughput on consumer Pascals to ~1/64 of FP32,
# so an FP16 GGUF either runs in software-emulated FP16 (slow) or upcasts to
# FP32 (loses the size benefit). Q8_0 stores weights at 8-bit but the GPU
# compute path stays FP32, which Pascal handles at full rate. Quality loss
# vs FP16 is negligible for embedding tasks. Switch to Q4_K_M for a smaller
# memory footprint or Q5_K_M for a balance, if VRAM ever gets tight.
MODEL_REPO="nomic-ai/nomic-embed-text-v1.5-GGUF"
MODEL_FILE="nomic-embed-text-v1.5.Q8_0.gguf"
MODEL_URL="https://huggingface.co/${MODEL_REPO}/resolve/main/${MODEL_FILE}"

# CUDA 12.9 install constants. 12.9 was chosen because it is the last 12.x
# release that supports Pascal (sm_61, the 1080 Ti) AND officially supports
# gcc up to 14.x. CUDA 13.0+ drops Pascal entirely. libs/cuda must contain a
# matching toolkit version or it gets reinstalled. The prefix is used
# string-wise: nvcc reporting "12.9.41" matches the "12.9" prefix.
#
# NOTE(review): CUDA_DRIVER_MIN doubles as a component of the installer file
# name below and as the "needs driver >=" hint printed by check_requirements;
# confirm the version embedded in NVIDIA's file name really is the minimum
# driver the toolkit accepts.
REQUIRED_CUDA_PREFIX="12.9"
CUDA_VERSION="12.9.0"
CUDA_DRIVER_MIN="575.51.03"
CUDA_INSTALLER_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/cuda_${CUDA_VERSION}_${CUDA_DRIVER_MIN}_linux.run"

# Max host gcc that CUDA 12.9's nvcc accepts without -allow-unsupported-compiler.
# Hardcoded because dynamic detection adds complexity for marginal benefit;
# update when bumping the toolkit version. (CUDA 12.6 was here too, removed
# along with the Ollama path — see issue 10-049 for why.)
CUDA_12_9_MAX_GCC=14

# _normalize_build_jobs VALUE
# Prints VALUE as a plain base-10 integer when it is a positive whole number;
# otherwise warns on stderr and prints the default of 8. Validating here means
# a typo in BUILD_JOBS is caught before any download or compile work starts,
# instead of surfacing as a cmake "-j" error at the very end.
_normalize_build_jobs() {
    local requested="$1"
    local jobs

    case "$requested" in
        ''|*[!0-9]*)
            echo "WARNING: BUILD_JOBS='${requested}' is not a positive integer; using 8" >&2
            echo 8
            return
            ;;
    esac

    # 10# forces base 10 so a value like "08" is not rejected as bad octal.
    jobs=$((10#$requested))
    if [ "$jobs" -lt 1 ]; then
        echo "WARNING: BUILD_JOBS='${requested}' is not a positive integer; using 8" >&2
        echo 8
        return
    fi
    echo "$jobs"
}

# Cap on parallel build jobs for cmake --build. Defaults to 8 so the host
# CPU does not redline its thermal budget during summer; override with
# BUILD_JOBS=N before invoking the script if you have a colder machine and
# want maximum throughput (e.g. BUILD_JOBS=16 ./scripts/build-deps.sh).
BUILD_JOBS="$(_normalize_build_jobs "${BUILD_JOBS:-8}")"
# }}}

103

# {{{ Color codes for human-readable output
# Every value holds a real ESC byte: bash's ANSI-C quoting ($'...') decodes
# the escape when the variable is assigned, so the codes render through both
# `echo -e` AND `cat <<EOF` heredocs. The earlier double-quoted "\033[92m"
# form was a literal string that only echo -e expanded; heredocs printed it
# verbatim, which produced the visible "\033[92m" in print_env_summary's
# banner.
C_RESET=$'\e[0m'
C_RED=$'\e[91m'
C_GREEN=$'\e[92m'
C_YELLOW=$'\e[93m'
C_BLUE=$'\e[94m'
# }}}

116

# {{{ parse_arguments
# _print_usage_header FILE
# Prints the leading comment block of FILE (everything after the shebang up
# to, but not including, the first "# {{{" fold marker) with the "# " prefix
# stripped. Blank lines inside the header are kept as paragraph breaks;
# trailing blank lines are dropped. The scan also stops at the first line
# that is neither blank nor a comment.
_print_usage_header() {
    awk '
        NR == 1 { next }
        index($0, "# {{{") == 1 { exit }
        /^[[:space:]]*$/ { blanks++; next }
        !/^#/ { exit }
        {
            while (blanks > 0) { print ""; blanks-- }
            sub(/^# ?/, "")
            print
        }
    ' "$1"
}

# parse_arguments ARGS...
# Turns the command line into global settings and derives every path that
# depends on the project directory.
#   Globals written: CLEAN_BUILD, SKIP_MODEL, SKIP_CUDA, FORCE_CUDA (0 or 1),
#                    DIR (only when a positional argument is present),
#                    LOCAL_CUDA, DOWNLOAD_DIR, CUDA_INSTALLER_PATH,
#                    LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR.
#   Globals read:    CUDA_VERSION, C_RED, C_RESET.
#   Exits 0 after printing the file header for --help / -h.
#   Exits 1 on an unrecognized option.
parse_arguments() {
    CLEAN_BUILD=0
    SKIP_MODEL=0
    SKIP_CUDA=0
    FORCE_CUDA=0

    local arg
    for arg in "$@"; do
        case "$arg" in
            --clean)
                CLEAN_BUILD=1
                ;;
            --no-model)
                SKIP_MODEL=1
                ;;
            --skip-cuda)
                # Trust whatever the operator already set up. Useful when
                # iterating on the llama.cpp build itself.
                SKIP_CUDA=1
                ;;
            --force-cuda)
                # Wipe libs/cuda before re-installing — useful when the
                # previous install is partial or the wrong version.
                FORCE_CUDA=1
                ;;
            --help|-h)
                # The previous "sed -n '2,/^$/p'" stopped at the first blank
                # line of the header, so the Usage section never appeared.
                _print_usage_header "$0"
                exit 0
                ;;
            -*)
                echo -e "${C_RED}Unknown option: $arg${C_RESET}" >&2
                echo "Run with --help for usage." >&2
                exit 1
                ;;
            *)
                # A bare argument is the project directory. If several are
                # given, the last one wins.
                DIR="$arg"
                ;;
        esac
    done

    # Derived paths — must come after DIR is finalized.
    LOCAL_CUDA="${DIR}/libs/cuda"
    DOWNLOAD_DIR="${DIR}/tmp/downloads"
    CUDA_INSTALLER_PATH="${DOWNLOAD_DIR}/cuda_${CUDA_VERSION}_linux.run"
    # llama.cpp paths: clone + build live in RAM-backed tmp/, the cmake
    # install step copies the finished bin/lib/include into the disk-backed
    # libs/llama.cpp. Wiping tmp/ between runs is safe — we'll re-clone.
    LLAMACPP_SRC_DIR="${DIR}/tmp/llamacpp-src"
    LLAMACPP_INSTALL_DIR="${DIR}/libs/llama.cpp"
}
# }}}

169

# {{{ get_nvcc_version
# get_nvcc_version NVCC_PATH
# Prints the version an nvcc binary reports after the "V" in its --version
# output (e.g. "12.9.41"). Prints an empty line when NVCC_PATH is not
# executable. Used both to validate an existing install and to decide which
# gcc-compat flags apply.
get_nvcc_version() {
    local candidate="$1"

    if [ -x "$candidate" ]; then
        # First "V<digits and dots>" token wins; nvcc's own stderr is dropped.
        "$candidate" --version 2>/dev/null | grep -oP 'V\K[\d.]+' | head -1
        return
    fi

    printf '\n'
}
# }}}

183

# {{{ get_nvcc_major_minor
# get_nvcc_major_minor NVCC_PATH
# Prints only the first two dot-separated fields of the nvcc version (e.g.
# "12.9" for "12.9.41"), or an empty line when get_nvcc_version reports
# nothing. The max-gcc constant is chosen from this prefix; the original
# author's note is that point releases never affect host-compiler support.
get_nvcc_major_minor() {
    local nvcc_path="$1"
    local reported
    reported=$(get_nvcc_version "$nvcc_path")

    if [ -n "$reported" ]; then
        cut -d. -f1-2 <<< "$reported"
    else
        echo ""
    fi
}
# }}}

199

# {{{ detect_cuda_arch
# Prints the first GPU's compute capability as reported by nvidia-smi with
# the dot, spaces and newline removed (e.g. "61" for 6.1), or an empty line
# when nvidia-smi is not available. build_llamacpp feeds the value to
# CMAKE_CUDA_ARCHITECTURES so the build targets exactly the GPU it will run
# on instead of relying on cmake's default arch list (which may or may not
# include Pascal sm_61).
detect_cuda_arch() {
    if command -v nvidia-smi >/dev/null 2>&1; then
        nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null \
            | head -n 1 \
            | tr -d '. \n'
    else
        echo ""
    fi
}
# }}}

215

# {{{ host_gcc_major
# Prints the part of `gcc -dumpfullversion` before the first dot (e.g. "14"
# for gcc 14.2.1). Prints nothing when gcc produces no output; gcc's stderr
# is discarded.
host_gcc_major() {
    gcc -dumpfullversion 2>/dev/null \
        | cut -d '.' -f 1
}
# }}}

222

# {{{ check_requirements
# Verifies the build tools are available before anything expensive starts.
# A missing nvcc is NOT fatal here — build_cuda installs one. Only the truly
# required upstream tools (git, cmake, make, curl, gcc, g++) and the NVIDIA
# driver (nvidia-smi) abort the run.
#   Globals read: LOCAL_CUDA, CUDA_DRIVER_MIN, C_* color codes.
#   Exits 1 when any required tool or the driver is missing.
check_requirements() {
    echo -e "${C_BLUE}== Checking prerequisites ==${C_RESET}"
    local missing=0
    local tool

    for tool in git cmake make curl gcc g++; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            echo -e " ${C_RED}MISSING${C_RESET} $tool"
            missing=1
        else
            echo -e " ${C_GREEN}found${C_RESET} $tool ($(command -v "$tool"))"
        fi
    done

    # nvidia-smi (the driver) is required regardless of CUDA install path.
    # We can install the toolkit, but we cannot install the kernel driver
    # from a userspace script.
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        echo -e " ${C_RED}MISSING${C_RESET} nvidia-smi — NVIDIA driver not installed"
        echo -e " ${C_YELLOW}HINT${C_RESET} install the NVIDIA driver via your distro's mechanism"
        echo -e " ${C_YELLOW}HINT${C_RESET} CUDA toolkit needs driver >= ${CUDA_DRIVER_MIN}"
        missing=1
    else
        local driver_ver gpu_name compute_cap
        driver_ver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
        gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
        compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1)
        echo -e " ${C_GREEN}found${C_RESET} nvidia driver $driver_ver ($gpu_name, sm_$(echo "$compute_cap" | tr -d '.'))"
    fi

    # Only claim "found" when gcc actually reported a version; with gcc
    # missing (already flagged above) the old code printed an empty version.
    local gcc_major
    gcc_major=$(host_gcc_major)
    if [ -n "$gcc_major" ]; then
        echo -e " ${C_GREEN}found${C_RESET} gcc major version $gcc_major"
    else
        echo -e " ${C_YELLOW}note${C_RESET} could not determine gcc major version"
    fi

    # Don't bail on missing nvcc — build_cuda will install it. Just report.
    local local_nvcc="${LOCAL_CUDA}/bin/nvcc"
    local ver
    if [ -x "$local_nvcc" ]; then
        ver=$(get_nvcc_version "$local_nvcc")
        echo -e " ${C_GREEN}found${C_RESET} project nvcc $ver at libs/cuda/"
    elif command -v nvcc >/dev/null 2>&1; then
        ver=$(nvcc --version | grep -oP 'release \K[0-9.]+' | head -n1)
        echo -e " ${C_YELLOW}note${C_RESET} system nvcc $ver on PATH (will install fresh into libs/cuda/)"
    else
        echo -e " ${C_YELLOW}note${C_RESET} no nvcc found — build_cuda will install one"
    fi

    if [ "$missing" -ne 0 ]; then
        echo -e "${C_RED}== Prerequisites missing — aborting ==${C_RESET}" >&2
        exit 1
    fi
}
# }}}

281

# {{{ ensure_directories
# Creates libs/, assets/models/, tmp/ and the download directory under DIR.
# tmp/ is delegated to scripts/ensure-tmp-symlink (per the original note it
# links tmp/ to a RAM-backed location, in line with the project convention
# of keeping intermediate state off the disk-backed tree — the helper is not
# visible here). If the helper is absent or fails, tmp/ falls back to a
# plain directory.
ensure_directories() {
    mkdir -p "${DIR}/libs" "${DIR}/assets/models"

    if ! "${DIR}/scripts/ensure-tmp-symlink" "${DIR}" 2>/dev/null; then
        mkdir -p "${DIR}/tmp"
    fi

    mkdir -p "$DOWNLOAD_DIR"
}
# }}}

293

# {{{ already_have_libs_cuda
# Succeeds (0) when libs/cuda's nvcc reports a version that starts with
# REQUIRED_CUDA_PREFIX (e.g. "12.9"); fails (1) when there is no working
# nvcc or the version does not match. A mismatch is reported as an upgrade
# because the script supports exactly one toolkit version and a hybrid
# layout would silently break the build. The caller decides what to do with
# the answer.
already_have_libs_cuda() {
    local installed
    installed=$(get_nvcc_version "${LOCAL_CUDA}/bin/nvcc")

    [ -n "$installed" ] || return 1

    if [[ $installed == ${REQUIRED_CUDA_PREFIX}* ]]; then
        echo -e " ${C_GREEN}existing${C_RESET} libs/cuda has CUDA $installed (matches required ${REQUIRED_CUDA_PREFIX}.x)"
        return 0
    fi

    # install_cuda_runfile wipes libs/cuda before writing the new toolkit,
    # so reporting "reinstalling" here is accurate.
    echo -e " ${C_YELLOW}upgrade${C_RESET} libs/cuda has CUDA $installed but ${REQUIRED_CUDA_PREFIX}.x is required — reinstalling"
    return 1
}
# }}}

320

# {{{ force_clean_libs_cuda
# Implements --force-cuda: removes libs/cuda before installation so that a
# corrupt or wrong-version install cannot survive. Does nothing unless
# FORCE_CUDA is 1 and LOCAL_CUDA is an existing directory.
force_clean_libs_cuda() {
    [ "$FORCE_CUDA" -eq 1 ] || return 0
    [ -d "$LOCAL_CUDA" ] || return 0

    echo -e "${C_YELLOW}== --force-cuda: removing existing libs/cuda ==${C_RESET}"
    rm -rf "$LOCAL_CUDA"
}
# }}}

331

# {{{ download_cuda_installer
# Fetches the CUDA runfile installer to CUDA_INSTALLER_PATH unless a file is
# already there.
#
# The transfer is written to "<path>.part" and renamed only after curl
# succeeds. Writing straight to the final path meant an interrupted run
# (Ctrl-C, power loss) left a truncated file that the next run reported as
# "cached" and handed to the installer.
#   Globals read: CUDA_INSTALLER_PATH, CUDA_INSTALLER_URL, CUDA_VERSION, C_*.
#   Exits 1 when the download or the final rename fails.
download_cuda_installer() {
    if [ -f "$CUDA_INSTALLER_PATH" ]; then
        local size_mb
        size_mb=$(du -m "$CUDA_INSTALLER_PATH" | cut -f1)
        echo -e " ${C_GREEN}cached${C_RESET} installer already at $CUDA_INSTALLER_PATH (${size_mb} MB)"
        return
    fi

    local partial="${CUDA_INSTALLER_PATH}.part"
    # A leftover .part belongs to an aborted run; start over cleanly.
    rm -f "$partial"

    echo -e "${C_BLUE}== Downloading CUDA $CUDA_VERSION installer (~5 GB) ==${C_RESET}"
    echo -e " url: $CUDA_INSTALLER_URL"
    if ! curl -L --fail --progress-bar -o "$partial" "$CUDA_INSTALLER_URL"; then
        echo -e "${C_RED}CUDA installer download failed.${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi

    if ! mv -f "$partial" "$CUDA_INSTALLER_PATH"; then
        echo -e "${C_RED}CUDA installer download failed.${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi
}
# }}}

349

# {{{ install_cuda_runfile
# Runs the .run installer with --toolkit and --toolkitpath pointing at the
# project-local libs/cuda/. Because the target is user-writable, no sudo is
# needed and nothing lands in /usr/local. Per the original author's notes the
# CUDA 12.9 installer writes its own log to a hardcoded
# /var/log/cuda-installer.log (silently failing as non-root) and offers no
# flag to move it, so the installer's stdout+stderr are captured in a
# project-local log file instead. That covers most failure modes (bad flags,
# missing toolkit components, permission issues on the install path).
#   Globals read: LOCAL_CUDA, DIR, CUDA_INSTALLER_PATH, CUDA_VERSION, C_*.
#   Exits 1 when the installer fails or leaves no executable bin/nvcc.
#   On success hands over to patch_cuda_headers.
install_cuda_runfile() {
    local log_file="${DIR}/tmp/cuda-installer-output.log"
    local -a installer_args=(
        --silent
        --toolkit
        "--toolkitpath=${LOCAL_CUDA}"
        --no-opengl-libs
        --no-man-page
        "--tmpdir=${DIR}/tmp"
    )

    # Start from an empty prefix: files left over from a different toolkit
    # version would otherwise mix with the new install (hybrid layout).
    rm -rf "$LOCAL_CUDA"
    mkdir -p "$LOCAL_CUDA"

    echo -e "${C_BLUE}== Installing CUDA $CUDA_VERSION toolkit into libs/cuda (no sudo) ==${C_RESET}"
    echo -e " prefix: $LOCAL_CUDA"
    echo -e " log: $log_file (installer stdout+stderr; --tmpdir keeps work in tmp/)"

    if ! sh "$CUDA_INSTALLER_PATH" "${installer_args[@]}" > "$log_file" 2>&1; then
        echo -e "${C_RED}CUDA installer failed.${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} check $log_file for the installer's output" >&2
        exit 1
    fi

    if [ -x "${LOCAL_CUDA}/bin/nvcc" ]; then
        echo -e "${C_GREEN}CUDA $CUDA_VERSION installed at $LOCAL_CUDA${C_RESET}"
        patch_cuda_headers
    else
        echo -e "${C_RED}Installer completed but nvcc not at ${LOCAL_CUDA}/bin/nvcc${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} check $log_file for what went wrong" >&2
        exit 1
    fi
}
# }}}

392

# {{{ patch_cuda_headers
# Per the original author's notes, CUDA 12.9.0's math_functions.h declares
# sinpi/sinpif/cospi/cospif WITHOUT noexcept(true), while glibc 2.40+
# declares the same functions WITH it via __MATHCALL_VEC. nvcc rejects the
# exception-specification mismatch, breaking every CUDA compilation on hosts
# with modern glibc. CUDA 12.9.1+ ships with the noexcept already in place;
# this function applies the same change to the 12.9.0 headers so the
# toolkit does not have to be re-downloaded.
#
# The declaration pattern must escape its parentheses: under `sed -E` a bare
# "$" is an end-of-line anchor, so the previous address
# "(sinpi|...)$.*$;$" could never match and the header was left untouched
# while "patched" was still printed.
#
# Lines that already contain "noexcept" are skipped, so re-running is a
# no-op. The outcome is reported truthfully: patched, already patched, or
# declarations not found.
#   Globals read: LOCAL_CUDA, C_*.
patch_cuda_headers() {
    local header="${LOCAL_CUDA}/targets/x86_64-linux/include/crt/math_functions.h"
    if [ ! -f "$header" ]; then
        echo -e " ${C_YELLOW}WARN${C_RESET} math_functions.h not at expected path — skipping noexcept patch"
        return
    fi

    # ERE matching one declaration line, e.g.
    #   extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x);
    local decl_re='^extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ .* (sinpi|cospi|sinpif|cospif)\(.*\);$'

    local total pending
    total=$(grep -Ec -- "$decl_re" "$header")
    pending=$(grep -E -- "$decl_re" "$header" | grep -vc 'noexcept')

    if [ "${total:-0}" -eq 0 ]; then
        echo -e " ${C_YELLOW}WARN${C_RESET} math_functions.h: no sinpi/cospi/sinpif/cospif declarations matched — header left unchanged"
        return
    fi

    if [ "${pending:-0}" -eq 0 ]; then
        echo -e " ${C_GREEN}ok${C_RESET} math_functions.h: sinpi/cospi/sinpif/cospif already carry noexcept"
        return
    fi

    if ! sed -i -E \
        -e '/noexcept/!{' \
        -e "/${decl_re}/s/;\$/ noexcept(true);/" \
        -e '}' "$header"; then
        echo -e " ${C_YELLOW}WARN${C_RESET} math_functions.h: sed could not rewrite the header — noexcept patch not applied"
        return
    fi

    echo -e " ${C_GREEN}patched${C_RESET} math_functions.h: added noexcept(true) to sinpi/cospi/sinpif/cospif"
}
# }}}

417

# {{{ build_cuda
# Top-level CUDA install entry point. Three outcomes:
#   - --skip-cuda set:            trust the operator, do nothing
#   - libs/cuda already matches:  do nothing (version-prefix match)
#   - otherwise:                  download the installer and install directly
#                                 into libs/cuda/ with no sudo
# --force-cuda cleanup runs before the version check, so a forced run always
# ends up in the third case.
#   Exits 1 when the install finished but libs/cuda/bin/nvcc is not
#   executable.
build_cuda() {
    if [ "$SKIP_CUDA" -eq 1 ]; then
        echo -e "${C_YELLOW}== --skip-cuda set — trusting existing CUDA setup ==${C_RESET}"
        return
    fi

    force_clean_libs_cuda

    if ! already_have_libs_cuda; then
        download_cuda_installer
        install_cuda_runfile

        # Final sanity check — the install must have produced a working nvcc.
        [ -x "${LOCAL_CUDA}/bin/nvcc" ] || {
            echo -e "${C_RED}CUDA install completed but libs/cuda/bin/nvcc is missing${C_RESET}" >&2
            exit 1
        }
    fi
}
# }}}

446

# {{{ detect_old_llamacpp_layout
# Earlier versions of this script kept llama.cpp's source checkout at
# libs/llama.cpp/ (with .git inside). The current layout keeps the source in
# tmp/ and uses libs/llama.cpp/ purely as an install prefix. When an old
# checkout is found and --clean was not given, stop with instructions; the
# original note says this is meant to run BEFORE the CUDA step so nobody
# pays for a 5 GB download only to hit the layout error afterwards. With
# --clean we simply continue — clone_llamacpp wipes libs/llama.cpp as part
# of its clean handling.
#   Returns 0 when there is nothing to complain about; exits 1 otherwise.
detect_old_llamacpp_layout() {
    if [ ! -d "${LLAMACPP_INSTALL_DIR}/.git" ] || [ "$CLEAN_BUILD" -eq 1 ]; then
        return 0
    fi

    {
        echo -e "${C_YELLOW}== Detected old layout: libs/llama.cpp/ contains a git checkout ==${C_RESET}"
        echo -e " The new layout uses libs/llama.cpp/ as an install prefix (bin/, lib/,"
        echo -e " include/), with the source tree living in tmp/llamacpp-src/. Either"
        echo -e " re-run with --clean to wipe the old checkout, or delete it manually:"
        echo -e " rm -rf ${LLAMACPP_INSTALL_DIR}"
    } >&2
    exit 1
}
# }}}

470

# {{{ clone_llamacpp
# Puts the llama.cpp source at the pinned tag into the RAM-backed tmp/ tree:
# a fresh shallow clone when nothing usable is there, otherwise a refresh of
# the existing checkout. The clone is intentionally NOT under libs/ — only
# finished artifacts live there. Pinning to a tag keeps the build
# reproducible across sessions; -c advice.detachedHead=false silences git's
# cosmetic detached-HEAD warning for tag checkouts.
#
# The refresh fetches exactly the pinned tag via an explicit refspec with
# --no-tags. The earlier "fetch --tags" asked the remote for every tag, which
# pulls far more than the one tag being checked out.
#   Globals read: CLEAN_BUILD, LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR,
#                 LLAMACPP_VERSION, C_*.
#   With --clean (CLEAN_BUILD=1) both the tmp/ source tree and the
#   libs/llama.cpp install are removed first.
#   Exits 1 when the fetch, checkout or clone fails.
clone_llamacpp() {
    if [ "$CLEAN_BUILD" -eq 1 ]; then
        if [ -d "$LLAMACPP_SRC_DIR" ]; then
            echo -e "${C_BLUE}== --clean: removing tmp/llamacpp-src ==${C_RESET}"
            rm -rf "$LLAMACPP_SRC_DIR"
        fi
        if [ -d "$LLAMACPP_INSTALL_DIR" ]; then
            echo -e "${C_BLUE}== --clean: removing libs/llama.cpp install ==${C_RESET}"
            rm -rf "$LLAMACPP_INSTALL_DIR"
        fi
    fi

    if [ -d "${LLAMACPP_SRC_DIR}/.git" ]; then
        echo -e "${C_BLUE}== Updating llama.cpp checkout in tmp/ ==${C_RESET}"
        local tag_ref="refs/tags/${LLAMACPP_VERSION}"
        # "+" lets a moved tag be updated; --no-tags keeps all other tags out.
        git -C "$LLAMACPP_SRC_DIR" fetch --no-tags --depth=1 origin \
            "+${tag_ref}:${tag_ref}" || {
            echo -e "${C_RED}Failed to fetch llama.cpp tag $LLAMACPP_VERSION${C_RESET}" >&2
            exit 1
        }
        git -c advice.detachedHead=false -C "$LLAMACPP_SRC_DIR" checkout "$LLAMACPP_VERSION" || {
            echo -e "${C_RED}Failed to checkout llama.cpp $LLAMACPP_VERSION${C_RESET}" >&2
            exit 1
        }
    else
        # If something exists at LLAMACPP_SRC_DIR but it isn't a git checkout
        # (interrupted clone, leftover dir), clear it so the clone has a
        # clean target. tmp/ is RAM-backed so wiping is cheap.
        if [ -e "$LLAMACPP_SRC_DIR" ]; then
            rm -rf "$LLAMACPP_SRC_DIR"
        fi
        mkdir -p "$(dirname "$LLAMACPP_SRC_DIR")"
        echo -e "${C_BLUE}== Cloning llama.cpp (tag $LLAMACPP_VERSION) into tmp/ ==${C_RESET}"
        git -c advice.detachedHead=false clone --depth=1 --branch "$LLAMACPP_VERSION" \
            https://github.com/ggml-org/llama.cpp.git "$LLAMACPP_SRC_DIR" || {
            echo -e "${C_RED}Clone failed. Check network and tag validity.${C_RESET}" >&2
            exit 1
        }
    fi

    echo -e "${C_GREEN}llama.cpp source ready at $LLAMACPP_SRC_DIR${C_RESET}"
}
# }}}

518

# {{{ build_llamacpp
# Configures and builds llama.cpp with CUDA support. Build artifacts go to
# tmp/llamacpp-src/build/ (RAM-backed) — only the install step copies
# finished products into libs/. Three GPU-specific decisions happen here:
#   1. CUDAToolkit_ROOT points at libs/cuda/ whenever that toolkit exists,
#      and its bin/ and lib64/ are put first on PATH / LD_LIBRARY_PATH for
#      the rest of the script. Without libs/cuda, cmake auto-detects.
#   2. CMAKE_CUDA_ARCHITECTURES is set to the detected GPU's compute
#      capability (sm_61 for the 1080 Ti) unless the operator exported
#      CMAKE_CUDA_ARCHITECTURES. This both speeds up the build and keeps
#      Pascal in the compiled arch list.
#   3. If the toolkit is 12.9 and the host gcc is newer than
#      CUDA_12_9_MAX_GCC, -allow-unsupported-compiler is added so nvcc skips
#      its host-compiler gate. Meant as a forward-compat hedge against
#      rolling distros that bump gcc before we can bump CUDA (CUDA 13 is not
#      an option as long as we want Pascal).
#   Globals read: LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR, LOCAL_CUDA,
#                 CLEAN_BUILD, BUILD_JOBS, LLAMACPP_VERSION,
#                 CUDA_12_9_MAX_GCC, CMAKE_CUDA_ARCHITECTURES (env), C_*.
#   Side effects: changes the working directory to the build directory and
#                 may export PATH / LD_LIBRARY_PATH.
#   Exits 1 when the build directory cannot be entered, when configure or
#   build fails, or when an expected binary is missing afterwards.
build_llamacpp() {
    local build_dir="${LLAMACPP_SRC_DIR}/build"

    # CMAKE_INSTALL_PREFIX is baked into the configure step so a later
    # `cmake --install` lays artifacts under libs/llama.cpp/. Re-running
    # without --clean keeps incremental compile state in the tmp/ build
    # tree, which is the fastest iteration loop.
    if [ "$CLEAN_BUILD" -eq 1 ] && [ -d "$build_dir" ]; then
        rm -rf "$build_dir"
    fi

    mkdir -p "$build_dir"
    # An unchecked cd would run "cmake .." from whatever directory the
    # script happened to be in.
    cd "$build_dir" || {
        echo -e "${C_RED}Cannot enter build directory $build_dir${C_RESET}" >&2
        exit 1
    }

    # Binary location moved across the b4404 -> b9842 bump (Issue 10-031):
    # llama-server / llama-cli / llama-embedding used to live under examples/
    # (built when LLAMA_BUILD_EXAMPLES=ON); upstream relocated them to tools/,
    # gated by LLAMA_BUILD_TOOLS. We set BOTH ON so the build descends into
    # whichever tree the pinned tag uses -- turning the relevant one off
    # silently produces the shared libs with no llama-server. LLAMA_BUILD_SERVER
    # stays explicit for self-documenting intent (it gates the server tool).
    local cmake_flags=(
        -DGGML_CUDA=ON
        -DGGML_NATIVE=ON
        -DLLAMA_BUILD_TESTS=OFF
        -DLLAMA_BUILD_EXAMPLES=ON
        -DLLAMA_BUILD_TOOLS=ON
        -DLLAMA_BUILD_SERVER=ON
        -DLLAMA_CURL=OFF
        -DCMAKE_INSTALL_PREFIX="${LLAMACPP_INSTALL_DIR}"
        # Force lib/ over lib64/. cmake's GNUInstallDirs picks lib64/ on
        # Void/RHEL/SUSE conventions and lib/ on Debian/Arch — which means
        # our hardcoded RPATH "$ORIGIN/../lib" would resolve to the wrong
        # directory on lib64 distros and llama-server would fail to dlopen
        # libllama.so. Forcing lib/ unconditionally keeps the install
        # layout portable regardless of host distro.
        -DCMAKE_INSTALL_LIBDIR=lib
        # Set the binary RPATH so installed binaries find their .so
        # neighbors without needing LD_LIBRARY_PATH set at run time.
        -DCMAKE_INSTALL_RPATH='$ORIGIN/../lib'
        -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
    )

    # Point cmake at libs/cuda directly. This is the single source of
    # truth for "which CUDA does this build use".
    if [ -x "${LOCAL_CUDA}/bin/nvcc" ]; then
        echo -e "${C_BLUE}== Using project-local CUDA at $LOCAL_CUDA ==${C_RESET}"
        cmake_flags+=("-DCUDAToolkit_ROOT=${LOCAL_CUDA}")
        export PATH="${LOCAL_CUDA}/bin:${PATH}"
        export LD_LIBRARY_PATH="${LOCAL_CUDA}/lib64:${LD_LIBRARY_PATH:-}"
    else
        echo -e "${C_YELLOW}== No libs/cuda — cmake will auto-detect (system PATH, /usr/local/cuda*) ==${C_RESET}"
    fi

    # Target the detected GPU's compute capability explicitly. The
    # CMAKE_CUDA_ARCHITECTURES environment variable wins if the operator
    # set it (useful for cross-compiling to a different GPU).
    if [ -z "${CMAKE_CUDA_ARCHITECTURES:-}" ]; then
        local arch
        arch=$(detect_cuda_arch)
        if [ -n "$arch" ]; then
            echo -e " ${C_GREEN}arch${C_RESET} targeting sm_$arch (detected from nvidia-smi)"
            cmake_flags+=("-DCMAKE_CUDA_ARCHITECTURES=${arch}")
        else
            echo -e " ${C_YELLOW}arch${C_RESET} could not detect GPU — letting cmake pick defaults"
        fi
    else
        echo -e " ${C_BLUE}arch${C_RESET} using CMAKE_CUDA_ARCHITECTURES=$CMAKE_CUDA_ARCHITECTURES from env"
        cmake_flags+=("-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
    fi

    # CUDA 12.9 officially supports gcc up to 14. If the host gcc is
    # newer, add -allow-unsupported-compiler so nvcc skips its host-
    # compiler gate. With the project pinned to CUDA 12.9, the only way
    # this fires is on a host that bumped to gcc 15+ since the script
    # was last tested. When cuda_mm is empty or not 12.9 (e.g. a
    # --skip-cuda run against some other toolkit) nothing is added and
    # nothing is printed.
    local cuda_mm gcc_major
    cuda_mm=$(get_nvcc_major_minor "${LOCAL_CUDA}/bin/nvcc")
    gcc_major=$(host_gcc_major)
    if [ "$cuda_mm" = "12.9" ] && [ -n "$gcc_major" ]; then
        if [ "$gcc_major" -gt "$CUDA_12_9_MAX_GCC" ]; then
            echo -e " ${C_YELLOW}gcc${C_RESET} host gcc $gcc_major > CUDA $cuda_mm max ($CUDA_12_9_MAX_GCC); adding -allow-unsupported-compiler"
            cmake_flags+=("-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler")
        else
            echo -e " ${C_GREEN}gcc${C_RESET} host gcc $gcc_major within CUDA $cuda_mm supported range (<= $CUDA_12_9_MAX_GCC)"
        fi
    fi

    echo -e "${C_BLUE}== Configuring llama.cpp ==${C_RESET}"
    cmake .. "${cmake_flags[@]}" || {
        echo -e "${C_RED}CMake configure failed.${C_RESET}" >&2
        echo -e "${C_YELLOW}Likely causes (given the script's current path):${C_RESET}" >&2
        echo -e " - libs/cuda/ is incomplete (an interrupted install) — try --force-cuda" >&2
        echo -e " - CMAKE_CUDA_ARCHITECTURES env override targets an arch this CUDA does not support" >&2
        echo -e " - llama.cpp upstream renamed a CMake variable since tag $LLAMACPP_VERSION (we pin to that tag)" >&2
        echo -e " - host gcc bumped past CUDA's max — update CUDA_12_*_MAX_GCC constants in this script" >&2
        exit 1
    }

    echo -e "${C_BLUE}== Building llama.cpp ==${C_RESET}"
    echo -e " ${C_GREEN}jobs${C_RESET} using $BUILD_JOBS parallel build jobs (override with BUILD_JOBS=N)"
    cmake --build . --config Release -j "$BUILD_JOBS" || {
        echo -e "${C_RED}Build failed.${C_RESET}" >&2
        exit 1
    }

    # Verify the binaries we care about actually got built (still in the
    # tmp/ build tree at this point — install_llamacpp copies them out).
    local missing=0
    local bin
    for bin in llama-server llama-cli llama-embedding; do
        if [ ! -x "$build_dir/bin/$bin" ]; then
            echo -e " ${C_RED}MISSING${C_RESET} $bin"
            missing=1
        else
            echo -e " ${C_GREEN}built${C_RESET} $bin"
        fi
    done

    if [ "$missing" -ne 0 ]; then
        # The hint used to describe the pre-b9842 layout (examples/); it now
        # matches the layout of the pinned tag as documented above.
        echo -e "${C_RED}Some binaries are missing. The CMake build flags may have skipped them.${C_RESET}" >&2
        echo -e " ${C_YELLOW}HINT${C_RESET} at this llama.cpp tag, llama-server / llama-cli / llama-embedding" >&2
        echo -e " ${C_YELLOW} ${C_RESET} live under tools/, so LLAMA_BUILD_TOOLS=ON is required (older tags" >&2
        echo -e " ${C_YELLOW} ${C_RESET} kept them under examples/, gated by LLAMA_BUILD_EXAMPLES=ON)." >&2
        exit 1
    fi

    echo -e "${C_GREEN}llama.cpp built successfully in tmp/${C_RESET}"
}
# }}}

665

# {{{ install_llamacpp
# Runs `cmake --install` on the tmp/ build tree so the finished artifacts
# land in the libs/llama.cpp/ prefix chosen at configure time. This is where
# artifacts "move from RAM to disk": afterwards the tmp/ source and build
# trees can be wiped without affecting the project's ability to run
# llama-server.
#   Globals read: LLAMACPP_SRC_DIR, LLAMACPP_INSTALL_DIR, C_*.
#   Exits 1 when cmake --install fails or when llama-server, llama-cli or
#   llama-embedding is not executable under the install prefix afterwards.
install_llamacpp() {
    local build_dir="${LLAMACPP_SRC_DIR}/build"

    echo -e "${C_BLUE}== Installing llama.cpp artifacts to $LLAMACPP_INSTALL_DIR ==${C_RESET}"
    if ! cmake --install "$build_dir" --config Release; then
        echo -e "${C_RED}cmake --install failed.${C_RESET}" >&2
        exit 1
    fi

    # Check the final on-disk location: catches install rules that changed
    # shape between llama.cpp versions so that our flags no longer match.
    local missing=0
    for bin in llama-server llama-cli llama-embedding; do
        if [ -x "${LLAMACPP_INSTALL_DIR}/bin/$bin" ]; then
            echo -e " ${C_GREEN}installed${C_RESET} bin/$bin"
        else
            echo -e " ${C_RED}MISSING${C_RESET} ${LLAMACPP_INSTALL_DIR}/bin/$bin"
            missing=1
        fi
    done

    [ "$missing" -eq 0 ] || {
        echo -e "${C_RED}cmake --install completed but expected binaries are missing.${C_RESET}" >&2
        exit 1
    }

    echo -e "${C_GREEN}llama.cpp installed at $LLAMACPP_INSTALL_DIR${C_RESET}"
}
# }}}

702

# {{{ download_model
# Pulls the configured GGUF model file (MODEL_FILE, from MODEL_URL) into
# assets/models/ unless it is already there. Skip with --no-model when
# iterating on the build itself.
#
# An existing file counts as present only when it is larger than
# 100,000,000 bytes; anything smaller is treated as a broken download and
# fetched again. The download size depends on the quantization selected in
# MODEL_FILE (currently Q8_0).
#
# The transfer is written to "<model>.part" and renamed only after curl
# succeeds, so an interrupted run cannot leave a truncated file at the final
# path — one that is over the size threshold would otherwise be accepted as
# "already present" on the next run.
#   Globals read: SKIP_MODEL, DIR, MODEL_FILE, MODEL_URL, C_*.
#   Exits 1 when the download or the final rename fails.
download_model() {
    if [ "$SKIP_MODEL" -eq 1 ]; then
        echo -e "${C_YELLOW}== Skipping model download (--no-model) ==${C_RESET}"
        return
    fi

    local model_path="${DIR}/assets/models/${MODEL_FILE}"
    local partial="${model_path}.part"

    if [ -f "$model_path" ]; then
        local size
        size=$(stat -c '%s' "$model_path")
        if [ "$size" -gt 100000000 ]; then
            echo -e "${C_GREEN}Model already present: $model_path ($((size / 1024 / 1024)) MB)${C_RESET}"
            return
        fi
        echo -e "${C_YELLOW}Existing model file is suspiciously small ($size bytes); re-downloading.${C_RESET}"
        rm -f "$model_path"
    fi

    # A leftover .part belongs to an aborted run; start over cleanly.
    rm -f "$partial"

    echo -e "${C_BLUE}== Downloading $MODEL_FILE from HuggingFace ==${C_RESET}"
    echo -e " URL: $MODEL_URL"
    if ! curl -L --fail --progress-bar -o "$partial" "$MODEL_URL"; then
        echo -e "${C_RED}Download failed.${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi

    if ! mv -f "$partial" "$model_path"; then
        echo -e "${C_RED}Download failed.${C_RESET}" >&2
        rm -f "$partial"
        exit 1
    fi

    echo -e "${C_GREEN}Model saved to $model_path${C_RESET}"
}
# }}}

738

# {{{ smoke_test
# Launch the server briefly, ping it for one embedding, kill it. This
# catches obvious "the build linked but does not actually run" failures
# before the operator wires the pipeline against it.
#
# Globals read: SKIP_MODEL, LLAMACPP_INSTALL_DIR, DIR, MODEL_FILE,
# LOCAL_CUDA, LD_LIBRARY_PATH, C_* colors.
# Returns early (no-op) when --no-model was given. Exits the script with
# status 1 when the server never becomes ready, dies during startup, or
# does not answer the embedding request with an "embedding" field.
smoke_test() {
    if [ "$SKIP_MODEL" -eq 1 ]; then
        echo -e "${C_YELLOW}== Skipping smoke test (model not downloaded) ==${C_RESET}"
        return
    fi

    local server_bin="${LLAMACPP_INSTALL_DIR}/bin/llama-server"
    local model_path="${DIR}/assets/models/${MODEL_FILE}"
    local port=18080 # Non-default to avoid collision with any running server
    local log="${DIR}/tmp/llamacpp-smoketest.log"
    local timeout_s=30

    # The server binary needs CUDA runtime libs at load time. The install
    # also bakes $ORIGIN/../lib into the binary RPATH so llama.cpp's own
    # libs (libllama.so, libggml*.so) resolve. We set LD_LIBRARY_PATH for
    # the CUDA runtime specifically — libs/cuda is not under the binary's
    # RPATH search.
    # The ":+" form appends the caller's value only when there is one; a
    # bare trailing ":" would add an empty entry, which the dynamic loader
    # treats as the current directory.
    local smoke_ld_path="${LOCAL_CUDA}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

    echo -e "${C_BLUE}== Smoke testing llama-server ==${C_RESET}"
    LD_LIBRARY_PATH="$smoke_ld_path" "$server_bin" \
        -m "$model_path" \
        --embedding \
        --host 127.0.0.1 \
        --port "$port" \
        > "$log" 2>&1 &
    local pid=$!

    # Wait up to $timeout_s seconds for the server to become responsive.
    local ready=0
    local i=0
    while [ "$i" -lt "$timeout_s" ]; do
        # A server that already died (missing shared library, port in
        # use, bad model) will never answer; stop waiting immediately.
        if ! kill -0 "$pid" 2>/dev/null; then
            break
        fi
        # --fail matters: llama-server answers /health with HTTP 503
        # while the model is still loading, and without --fail curl would
        # report that as success and end the wait too early.
        if curl -s --fail --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then
            ready=1
            break
        fi
        sleep 1
        i=$((i + 1))
    done

    if [ "$ready" -ne 1 ]; then
        if kill -0 "$pid" 2>/dev/null; then
            echo -e "${C_RED}Server did not become responsive within ${timeout_s} s${C_RESET}" >&2
        else
            echo -e "${C_RED}Server exited before becoming responsive${C_RESET}" >&2
        fi
        echo "Last 20 lines of server log ($log):" >&2
        tail -n 20 "$log" >&2
        # Best-effort teardown: the process may already be gone, and wait
        # reports the signal status of a killed job.
        kill "$pid" 2>/dev/null || true
        wait "$pid" 2>/dev/null || true
        exit 1
    fi

    # "|| true" so a curl failure still reaches the teardown below; the
    # empty/partial response is then rejected by the check further down.
    local response
    response=$(curl -s --max-time 10 "http://127.0.0.1:$port/v1/embeddings" \
        -H 'Content-Type: application/json' \
        -d '{"model": "nomic-embed-text", "input": "clustering: hello world"}') || true

    kill "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true

    if grep -q '"embedding"' <<<"$response"; then
        echo -e "${C_GREEN}Smoke test passed — server returned a valid embedding.${C_RESET}"
    else
        echo -e "${C_RED}Smoke test failed — unexpected response:${C_RESET}" >&2
        echo "$response" >&2
        exit 1
    fi
}
# }}}

806

# {{{ print_env_summary
# Final friendly summary so the operator knows what env vars to set if they
# want to invoke llama-server / nvcc by hand outside this script. The build
# script handles its own env internally, but downstream tools need the hint.
#
# Globals read: C_GREEN, C_RESET, LOCAL_CUDA, LLAMACPP_INSTALL_DIR,
# LLAMACPP_SRC_DIR, DIR, MODEL_FILE. Writes the summary to stdout.
print_env_summary() {
    # A heredoc does not interpret backslash escapes, whereas the other
    # colored messages in this script go through "echo -e". Expanding the
    # color variables with %b first keeps the banner colored whether they
    # hold literal "\033[..." text or real ESC bytes.
    # NOTE(review): the C_* definitions are outside this section — confirm
    # which form they use.
    local green reset
    printf -v green '%b' "${C_GREEN}"
    printf -v reset '%b' "${C_RESET}"

    # Paths are interpolated by the heredoc as-is (no escape processing).
    # The LD_LIBRARY_PATH hint uses ":+" so that an unset variable does not
    # produce a trailing ":" (an empty entry means the current directory).
    cat <<EOF

${green}===============================================================${reset}
${green}  build-deps.sh complete${reset}
${green}===============================================================${reset}
  CUDA:     ${LOCAL_CUDA}
  Binaries: ${LLAMACPP_INSTALL_DIR}/bin/
  Libs:     ${LLAMACPP_INSTALL_DIR}/lib/
  Source:   ${LLAMACPP_SRC_DIR} (RAM-backed; wipes on reboot)
  Model:    ${DIR}/assets/models/${MODEL_FILE}

  To use libs/cuda's nvcc/cuda-runtime from your shell, add to ~/.bashrc:
    export PATH="${LOCAL_CUDA}/bin:\$PATH"
    export LD_LIBRARY_PATH="${LOCAL_CUDA}/lib64\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}"
EOF
}
# }}}

829

# {{{ main
# Script driver: parse the command line, then run every build phase once,
# in a fixed order, with no arguments.
main() {
    parse_arguments "$@"

    # Phase order matters. detect_old_llamacpp_layout sits BEFORE the CUDA
    # install step so that an existing libs/llama.cpp/.git without --clean
    # stops the run early — saving the operator from eating a 5 GB CUDA
    # download just to discover their old layout blocks the install.
    local _main_step
    for _main_step in \
        check_requirements \
        detect_old_llamacpp_layout \
        ensure_directories \
        build_cuda \
        clone_llamacpp \
        build_llamacpp \
        install_llamacpp \
        download_model \
        smoke_test \
        print_env_summary
    do
        "$_main_step"
    done
}
# }}}

main "$@"

850

851# vim: set foldmethod=marker:

852