run.sh

27# {{{ setup_dir_path

# Print the project directory to use: the caller-supplied path ($1) when it
# is non-empty, otherwise the built-in default checkout location.
setup_dir_path() {
    echo "${1:-/mnt/mtwo/programming/ai-stuff/neocities-modernization}"
}

35# }}}

37# {{{ Signal handling

# Trap Ctrl+C (SIGINT) and SIGTERM so the script actually exits when the
# operator interrupts or terminates it. Bash on its own does not always
# propagate SIGINT to long-running children (luajit's tight inner loops in
# particular eat the signal), so we kill every background job in our process
# group and exit non-zero. The exit code follows the "128 + signal number"
# convention: 130 for SIGINT, 143 for SIGTERM.
#
# Arguments: $1 - name of the signal being handled, "INT" (default) or "TERM".
cleanup_on_interrupt() {
    local sig="${1:-INT}"
    local exit_code=130
    local reason="Interrupted by user (SIGINT)"
    if [ "$sig" = "TERM" ]; then
        exit_code=143
        reason="Terminated (SIGTERM)"
    fi

    # Ignore further INT/TERM while cleaning up. The group-wide kill below
    # also signals this shell; without this the handler would run again and
    # print its message twice.
    trap '' INT TERM

    echo
    echo "$reason" >&2
    # Kill anything we backgrounded; suppress errors when there are none.
    jobs -p | xargs -r kill 2>/dev/null
    # Best-effort kill the entire process group too, in case a child
    # spawned its own children without forwarding signals.
    kill -- -$$ 2>/dev/null
    exit "$exit_code"
}
trap 'cleanup_on_interrupt INT' INT
trap 'cleanup_on_interrupt TERM' TERM

54

# WE_STARTED_INFERENCE_SERVER records whether THIS run launched the llama.cpp
# server itself (because validation failed at startup). Only then does the
# EXIT trap below shut the server down again. A server that was already
# running when we started (the operator's, or a prior run's) is left alone:
# never kill what we did not start.
WE_STARTED_INFERENCE_SERVER=false

# cleanup_inference_server: stop the llama.cpp server this run auto-started.
# Installed as the EXIT trap, so it runs on every exit path. The PID comes
# from the file the start script writes under $DIR/tmp; a missing file or a
# stale PID (process no longer alive) ends the function quietly. This is
# best-effort cleanup, not a contract.
cleanup_inference_server() {
    # Nothing to do unless this run launched the server.
    $WE_STARTED_INFERENCE_SERVER || return 0

    local server_pid_file="$DIR/tmp/llamacpp-server.pid"
    [ -f "$server_pid_file" ] || return 0

    local server_pid
    server_pid=$(cat "$server_pid_file" 2>/dev/null)
    if ! { [ -n "$server_pid" ] && kill -0 "$server_pid" 2>/dev/null; }; then
        # Stale PID file — clean it up and move on.
        rm -f "$server_pid_file"
        return
    fi

    echo "Shutting down inference server (PID $server_pid) that this run started..." >&2
    kill "$server_pid" 2>/dev/null

    # Poll once a second, for at most 5 s, until the server has gone away
    # after the SIGTERM above.
    local waited
    for (( waited = 0; waited < 5; waited++ )); do
        kill -0 "$server_pid" 2>/dev/null || break
        sleep 1
    done

    # Still alive after the grace period: escalate.
    if kill -0 "$server_pid" 2>/dev/null; then
        echo " server did not exit on SIGTERM; sending SIGKILL" >&2
        kill -9 "$server_pid" 2>/dev/null
    fi
    rm -f "$server_pid_file"
}
trap cleanup_inference_server EXIT

102# }}}

104# {{{ TUI Library

# Load the TUI menu library used by interactive mode (with command preview).
# TUI_AVAILABLE becomes true only when luajit is on PATH and the library file
# exists; otherwise it stays false and nothing is sourced.
LIBS_DIR="/home/ritz/programming/ai-stuff/scripts/libs"
TUI_AVAILABLE=false
if command -v luajit >/dev/null 2>&1; then
    if [[ -f "${LIBS_DIR}/lua-menu.sh" ]]; then
        source "${LIBS_DIR}/lua-menu.sh"
        TUI_AVAILABLE=true
    fi
fi

112# }}}

114# {{{ show_help

# Print the full usage text to stdout. The heredoc delimiter is quoted, so
# nothing inside it is expanded. Every flag the argument parser accepts is
# listed here, including --no-boosts / --exclude-boosts.
show_help() {
    cat << 'EOF'
Usage: ./run.sh [FLAGS] [PROJECT_DIR]

Runs the poem processing pipeline. Stages are selected individually
by named flag (--generate-diversity, --parse, etc.) or by stage
number (--stage N, --stage=N). Use --full to run every stage.
With stage flags, runs only the specified stages in pipeline order.

Pipeline Stages (run in order, multiple can be specified):
  --update-words          Stage 1: Sync input files from words repository
  --extract               Stage 2: Extract content from backup archives
  --parse                 Stage 3: Parse poems from JSON sources into poems.json
  --validate              Stage 4: Run poem validation
  --catalog-images        Stage 5: Catalog images from input directories
  --generate-embeddings   Stage 6: Generate embeddings via the inference server (~2-3 hours)
  --generate-similarity   Stage 7: Build similarity matrix (~30 min)
  --generate-diversity    Stage 8: Pre-compute diversity cache (~42 hours)
  --generate-html         Stage 9: Generate poem pages, gallery, source browser
  --generate-wordcloud    Stage 10: Generate word-cloud menu and per-word pages

Stage Selection:
  --stage N               Select stage by number (e.g. --stage 8, --stage=5)
  --full                  Run ALL stages 1-10 including embeddings

Stage Configuration:
  --threads N             Thread count for parallel operations (default: 4)
  --force                 Force regeneration even if files are fresh
  --force-stage N         Force regenerate specific stage only (1-10)

Pagination (HTML Generation):
  --pages N               Pages per poem (default: from config, 1)
  --poems-per-page N      Poems per page for similar/different (default: 200)
  --chrono-per-page N     Poems per page for chronological (default: 500)
  --seed N                Master seed for all randomization (word-cloud shuffle,
                          image order). Same seed => identical output. Precedence:
                          this flag > config.randomization.seed > an auto-generated
                          seed. The resolved seed is recorded to
                          output/generation-metadata.json. (Issue 10-058)

Word Cloud:
  --wordcloud-words N     Number of words in word cloud (default: 200);
                          pass "all" (--wordcloud-words all) for every word
  --wordcloud-poems N     Poems per word-cloud page (default: 50)

Extraction Options:
  --include-boosts        Include fediverse boosts/reblogs in extraction
  --no-boosts, --exclude-boosts
                          Exclude fediverse boosts/reblogs (overrides config)

External Files (Issue 10-003b):
  --list-external         List configured external file sources
  --sync-only NAME        Sync only the specified external source

Inference Server (Issue 10-017):
  --server NAME           Use specific Inference server from config.lua
  --model NAME            Embedding model to use; must be one of the server's
                          available_models (default: the server's configured model)
  --list-servers          List available Inference servers and exit

Output Control:
  --quiet                 Suppress progress messages
  --verbose               Show detailed progress
  --dry-run               Show what would be executed without running
  --debug                 Write all logs to output/debug-logs/ (durable disk) and
                          keep them on exit, instead of the RAM-backed tmp/ that a
                          hard GPU lock + reboot would wipe. Also tees this script's
                          console output to output/debug-logs/run.log.
  --low-priority          Run compute-heavy stages at lower OS priority (nice -n 10)
                          Keeps desktop/terminal responsive during long operations

Interactive Mode:
  -I, --interactive       Launch TUI for interactive selection (with command preview)

Directory Options:
  --dir PATH              Assets directory (where poems.json etc. are stored)
  --output PATH           Output directory (default: output/)

Other:
  -h, --help              Show this help message

Examples:
  ./run.sh --full                          # Run ALL stages including embeddings
  ./run.sh --generate-html                 # Only regenerate HTML
  ./run.sh --stage 8                       # Run stage 8 by number
  ./run.sh --stage=5 --stage=9             # Stage 5 and stage 9
  ./run.sh --parse --generate-html         # Parse then generate HTML
  ./run.sh --generate-html --threads 8     # HTML with 8 threads
  ./run.sh --generate-html --pages 5       # Generate top 500 poems per file
  ./run.sh -I                              # Interactive TUI mode

Notes:
  - Stage 6 (embeddings) requires the inference server running with the embedding model
  - Stage 8 (diversity) takes ~42 hours but is a one-time cost
  - Once stages 6-8 complete, subsequent runs use cached data
EOF
}

210# }}}

212# {{{ Parse command line arguments

DIR=""
ASSETS_DIR=""
OUTPUT_DIR=""
INTERACTIVE=false

# Stage flags (boolean)
UPDATE_WORDS=false
EXTRACT=false
PARSE=false
VALIDATE=false
CATALOG_IMAGES=false
GENERATE_EMBEDDINGS=false
GENERATE_SIMILARITY=false
GENERATE_DIVERSITY=false
GENERATE_HTML=false
GENERATE_WORDCLOUD=false

# Config flags
THREADS=""
# Boost-inclusion override forwarded to extraction (empty = use config default)
BOOSTS_ARG=""
FORCE=false
# Issue 10-016: Per-stage force flags
FORCE_STAGE_1=false
FORCE_STAGE_2=false
FORCE_STAGE_3=false
FORCE_STAGE_4=false
FORCE_STAGE_5=false
FORCE_STAGE_6=false
FORCE_STAGE_7=false
FORCE_STAGE_8=false
FORCE_STAGE_9=false
FORCE_STAGE_10=false
QUIET=false
VERBOSE=false
DRY_RUN=false
# --debug: route all logs to output/ (durable disk) instead of the RAM-backed
# tmp/ symlink, and preserve them on exit. Added to diagnose a hard GPU lock:
# such a freeze forces a power-cycle, and the tmpfs-backed tmp/ is wiped on
# reboot, taking every diagnostic with it. See the setup block after `cd $DIR`.
DEBUG=false
# Issue 10-028: Lower process priority for UI responsiveness
LOW_PRIORITY=false
# Model propagation fix: run.sh no longer hard-codes a default model here, so
# source code can never disagree with config.lua about the default. CLI_MODEL
# holds --model ONLY when the operator actually passed it (that is what we record
# on the per-run overrides notepad); the effective MODEL_NAME used by run.sh's
# own path/freshness checks is resolved AFTER arg-parsing -- from CLI_MODEL if
# given, else from config.lua through the same code the child stages use, so
# every stage agrees by construction. See "Resolve the effective model" below.
CLI_MODEL=""
MODEL_NAME=""
# Issue 8-022: Pagination settings for HTML generation. CHRONO_PER_PAGE is
# initialised with the others so a value inherited from the caller's
# environment can never stand in for a flag that was not passed.
PAGES=""
POEMS_PER_PAGE=""
CHRONO_PER_PAGE=""

# Issue 10-058: master seed for all randomization. Empty here means "no --seed on
# the command line"; the resolver below then falls back to config.randomization.seed
# and finally to an auto-generated, recorded seed. RANDOM_SEED is the resolved value.
RANDOM_SEED_FLAG=""
RANDOM_SEED=""
RANDOM_SEED_SOURCE=""

# Issue 8-043: Word cloud configuration
# Word-cloud word count: a number, or the literal "all" for every word. Both the
# CLI (--wordcloud-words all) and the menu's "All Words" checkbox set this single
# value -- there is no separate "all" flag to keep in sync.
WORDCLOUD_WORDS=""
# Issue 8-050d: Poems per word-cloud page
WORDCLOUD_POEMS=""

# Issue 8-011: Fediverse boost inclusion (extraction stage). Kept in step with
# BOOSTS_ARG: --include-boosts sets it true, --no-boosts sets it false.
INCLUDE_BOOSTS=false

# Issue 10-003b: External file management
LIST_EXTERNAL=false
SYNC_ONLY=""

# Issue 10-017: Inference server configuration
INFERENCE_SERVER=""
LIST_SERVERS=false

# Track if any stage flag was explicitly set
STAGE_FLAG_SET=false

# _cli_require_value: abort with a usage error when a value-taking option is
# the last word on the command line. Without this check `shift 2` fails, no
# argument is consumed, and the parse loop never terminates.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
_cli_require_value() {
    if [ "$2" -lt 2 ]; then
        echo "Error: $1 requires a value" >&2
        echo "Use --help for usage information" >&2
        exit 1
    fi
}

# _cli_force_stage: set FORCE_STAGE_<N>=true for stage number $1 (1-10).
# Exits 1 with an error on any other value.
_cli_force_stage() {
    case "$1" in
        [1-9]|10)
            printf -v "FORCE_STAGE_$1" '%s' true
            ;;
        *)
            echo "ERROR: Invalid stage number: $1 (valid: 1-10)" >&2
            exit 1
            ;;
    esac
}

# _cli_select_stage: enable the stage with number $1 and record that a stage
# was explicitly selected. Stage map: 1=update-words, 2=extract, 3=parse,
# 4=validate, 5=catalog-images, 6=generate-embeddings, 7=generate-similarity,
# 8=generate-diversity, 9=generate-html, 10=generate-wordcloud.
# Exits 1 with an error on any other value.
_cli_select_stage() {
    case "$1" in
        1) UPDATE_WORDS=true ;;
        2) EXTRACT=true ;;
        3) PARSE=true ;;
        4) VALIDATE=true ;;
        5) CATALOG_IMAGES=true ;;
        6) GENERATE_EMBEDDINGS=true ;;
        7) GENERATE_SIMILARITY=true ;;
        8) GENERATE_DIVERSITY=true ;;
        9) GENERATE_HTML=true ;;
        10) GENERATE_WORDCLOUD=true ;;
        *)
            echo "Error: --stage expects a number 1-10, got: $1" >&2
            exit 1
            ;;
    esac
    STAGE_FLAG_SET=true
}

# parse_cli_args: read the command line ("$@") into the globals declared
# above. Value-taking options accept both "--opt VALUE" and "--opt=VALUE".
# Unknown options and malformed stage numbers exit 1; -h/--help prints the
# usage text and exits 0. A bare word is taken as the project directory.
parse_cli_args() {
    local stage
    while [[ $# -gt 0 ]]; do
        # Rewrite "--opt=value" as "--opt value" for the value-taking options,
        # so each of them needs only one arm below.
        case "$1" in
            --dir=*|--output=*|--threads=*|--seed=*|--force-stage=*|--model=*|\
            --pages=*|--poems-per-page=*|--chrono-per-page=*|--wordcloud-words=*|\
            --wordcloud-poems=*|--sync-only=*|--server=*|--stage=*)
                set -- "${1%%=*}" "${1#*=}" "${@:2}"
                ;;
        esac

        case "$1" in
            -h|--help)
                show_help
                exit 0
                ;;
            -I|--interactive) INTERACTIVE=true; shift ;;
            --force)          FORCE=true; shift ;;
            --quiet)          QUIET=true; shift ;;
            --verbose)        VERBOSE=true; shift ;;
            --dry-run)        DRY_RUN=true; shift ;;
            # --debug: persist logs to output/ (survives the reboot a hard GPU
            # lock forces). Handled after DIR is resolved, below.
            --debug)          DEBUG=true; shift ;;
            # Issue 10-028: Lower process priority for UI responsiveness
            --low-priority)   LOW_PRIORITY=true; shift ;;
            # Issue 10-003b / 10-017: immediate actions
            --list-external)  LIST_EXTERNAL=true; shift ;;
            --list-servers)   LIST_SERVERS=true; shift ;;

            # Simple "option + value" settings.
            --dir|--output|--threads|--seed|--model|--pages|--poems-per-page|\
            --chrono-per-page|--wordcloud-words|--wordcloud-poems|--sync-only|--server)
                _cli_require_value "$1" "$#"
                case "$1" in
                    --dir)              ASSETS_DIR="$2" ;;
                    --output)           OUTPUT_DIR="$2" ;;
                    --threads)          THREADS="$2" ;;
                    # Issue 10-058: highest-precedence seed; resolved later.
                    --seed)             RANDOM_SEED_FLAG="$2" ;;
                    --model)            CLI_MODEL="$2" ;;
                    # Issue 8-022: pagination
                    --pages)            PAGES="$2" ;;
                    --poems-per-page)   POEMS_PER_PAGE="$2" ;;
                    --chrono-per-page)  CHRONO_PER_PAGE="$2" ;;
                    # Issue 8-043 / 8-050d: word cloud ("all" = every word)
                    --wordcloud-words)  WORDCLOUD_WORDS="$2" ;;
                    --wordcloud-poems)  WORDCLOUD_POEMS="$2" ;;
                    --sync-only)        SYNC_ONLY="$2" ;;
                    --server)           INFERENCE_SERVER="$2" ;;
                esac
                shift 2
                ;;

            # Issue 10-016: Per-stage force regeneration
            --force-stage)
                _cli_require_value "$1" "$#"
                stage_num="$2"
                _cli_force_stage "$stage_num"
                shift 2
                ;;

            # Boost inclusion (reshared posts). These override config.privacy.
            # include_boosts and are forwarded to the extraction step. Only take
            # effect on a (re)parse, since they change what poems.json contains.
            # Both spellings of the setting are kept in one place so
            # BOOSTS_ARG and INCLUDE_BOOSTS can never disagree.
            --no-boosts|--exclude-boosts)
                BOOSTS_ARG="--no-boosts"
                INCLUDE_BOOSTS=false
                shift
                ;;
            --include-boosts)
                BOOSTS_ARG="--include-boosts"
                INCLUDE_BOOSTS=true
                shift
                ;;

            # Named stage flags
            --update-words)        _cli_select_stage 1; shift ;;
            --extract)             _cli_select_stage 2; shift ;;
            --parse)               _cli_select_stage 3; shift ;;
            --validate)            _cli_select_stage 4; shift ;;
            --catalog-images)      _cli_select_stage 5; shift ;;
            --generate-embeddings) _cli_select_stage 6; shift ;;
            --generate-similarity) _cli_select_stage 7; shift ;;
            --generate-diversity)  _cli_select_stage 8; shift ;;
            --generate-html)       _cli_select_stage 9; shift ;;
            --generate-wordcloud)  _cli_select_stage 10; shift ;;

            # --stage N or --stage=N — select a specific stage by number.
            # Can be repeated (e.g. --stage 6 --stage 7).
            --stage)
                _cli_require_value "$1" "$#"
                STAGE_NUM="$2"
                _cli_select_stage "$STAGE_NUM"
                shift 2
                ;;

            --full)
                # ALL stages including expensive embedding generation (1-10)
                for stage in 1 2 3 4 5 6 7 8 9 10; do
                    _cli_select_stage "$stage"
                done
                shift
                ;;

            -*)
                echo "Unknown option: $1" >&2
                echo "Use --help for usage information" >&2
                exit 1
                ;;
            *)
                DIR="$1"
                shift
                ;;
        esac
    done
}

parse_cli_args "$@"
# Every argument has been read into the variables above; clear the positional
# parameters so nothing later in the script re-reads them.
set --

622

# No implicit stages — require explicit selection. The operator should
# say what they want to run: a named stage flag, --stage N, or --full.
# Interactive mode and the immediate actions (--list-servers, --list-external,
# --sync-only NAME) select no stage by design, so they pass this check too;
# otherwise --list-external and --sync-only could never run on their own.
require_stage_selection() {
    if $STAGE_FLAG_SET || $INTERACTIVE || $LIST_SERVERS || $LIST_EXTERNAL; then
        return 0
    fi
    if [ -n "$SYNC_ONLY" ]; then
        return 0
    fi
    echo "Error: no stages selected. Use --full, a named stage flag" >&2
    echo " (e.g. --generate-diversity), --stage N, or -I for interactive mode." >&2
    echo " Run with --help for the full flag list." >&2
    exit 1
}
require_stage_selection

631

# Issue 8-032: Lua-side spelling of the --force flag, for passing to Lua
# functions as a boolean literal.
FORCE_LUA="false"
if $FORCE; then
    FORCE_LUA="true"
fi

# Issue 10-028: command prefix for heavy operations. With --low-priority they
# run at nice level 10 (lower priority), which keeps the desktop/terminal
# responsive during long pipeline runs; otherwise the prefix is empty.
NICE_PREFIX=""
! $LOW_PRIORITY || NICE_PREFIX="nice -n 10"

646# }}}

648# {{{ Setup directories

# Settle the project directory: the positional argument if one was given,
# otherwise setup_dir_path's default.
DIR=$(setup_dir_path "$DIR")

# Issue 10-051: stage wall-clock timing. Sourced after DIR is final so the
# library knows where .stage-timings lives. Provides timed_stage (wrap a stage
# to record its duration on success) and stage_timing_label (render the measured
# estimate for the pre-flight list). Missing file is harmless: timing is optional.
if [ -f "${DIR}/scripts/stage-timing.sh" ]; then
    source "${DIR}/scripts/stage-timing.sh"
fi
# Without the library, timed_stage must still exist so the stage dispatch runs
# unchanged: fall back to a passthrough that drops its first argument and runs
# the rest as the stage command, recording nothing.
if ! command -v timed_stage >/dev/null 2>&1; then
    timed_stage() { shift; "$@"; }
fi

# Argument string handed to the Lua scripts: "--dir <assets dir>" when an
# assets directory was given, empty otherwise.
ASSETS_ARG="${ASSETS_DIR:+--dir $ASSETS_DIR}"

# Everything below runs from inside the project directory.
if ! cd "$DIR"; then
    echo "Error: Could not access directory $DIR" >&2
    exit 1
fi

671# }}}

673# {{{ --debug: persistent logging

674# Why this exists: a hard GPU lock forces a power-cycle, and the tmp/ symlink

675# points at a tmpfs subdir under /tmp/ (RAM) that the reboot wipes — so the

676# logs that would explain the freeze are gone before they can be read.

677# --debug routes logs to output/debug-logs/ (durable disk) instead.

678#

679# Two mechanisms, working together:

680# 1. NEOCITIES_LOG_DIR is exported so the child scripts that own the

681# inference logs — scripts/start-llamacpp-server.sh (llamacpp-server.log)

682# and generate-embeddings.sh (embedding_generation.log) — write there

683# and skip their usual end-of-run log deletion.

684# 2. This script's own console output is tee'd to run.log, so whatever stage

685# was mid-flight at the instant of the freeze (including the GPU Vulkan

686# similarity/diversity stages, which log only to stdout) leaves a trail.

687#

688# Caveat worth knowing: on a true hard lock you must hard-power-cycle, and the

689# kernel may not have flushed the last few seconds of file writes (dirty pages)

690# to disk. Durable disk still captures vastly more than tmpfs, but the final

691# line or two before the lock can still be lost.

if $DEBUG; then
    LOG_DIR="$DIR/output/debug-logs"
    mkdir -p "$LOG_DIR"
    # Child scripts pick the log directory up from this exported variable.
    NEOCITIES_LOG_DIR="$LOG_DIR"
    export NEOCITIES_LOG_DIR
    # The Vulkan C library reads VKC_DEBUG to switch its progress bars from the
    # animated single-line "\r" form to verbose, newline-terminated lines --
    # the right shape when stdout is the fsync-logger pipe below and we want a
    # durable, per-line history of a possibly-freezing run.
    VKC_DEBUG=1
    export VKC_DEBUG
    # Interactive mode keeps its real stdout: the TUI checks isatty() and a
    # pipe would break its rendering. The child-script file logs still land in
    # LOG_DIR via the exported variable above.
    #
    # fsync-logger (not tee) is used so every line is fsync()'d to disk the
    # instant it is printed — the stage banners are exactly what triage needs,
    # and a hard lock right after a banner must not lose it to a dirty-page
    # buffer. Slow, but --debug is for catching a freeze, not for speed.
    $INTERACTIVE || exec > >("$DIR/scripts/fsync-logger" "$LOG_DIR/run.log") 2>&1
    echo "[DEBUG] Logging to $LOG_DIR (per-line fsync to disk; persists across reboots; logs kept on exit)"
fi

714# }}}

716# {{{ Issue 10-003b: Handle external file commands (immediate actions)

# --list-external: print the configured external sources and stop. The exit
# status is the helper's own, so a failed listing is not reported as success
# (the same way --sync-only reports its helper's status below).
if $LIST_EXTERNAL; then
    "$DIR/scripts/sync-external-files" --list
    exit $?
fi

# --sync-only NAME: sync just that external source and stop, exiting with the
# helper's status.
if [ -n "$SYNC_ONLY" ]; then
    "$DIR/scripts/sync-external-files" "$SYNC_ONLY"
    exit $?
fi

726# }}}

728# {{{ Issue 10-017: Handle Inference server commands (immediate actions)

# --list-servers: have the inference-server-config Lua module print its
# server list, then stop. The exit status is luajit's, so a Lua error (for
# example a module that fails to load) is not reported as success.
if $LIST_SERVERS; then
    luajit -e "
        package.path = '$DIR/libs/?.lua;' .. package.path
        local inference = require('inference-server-config')
        inference.list_servers()
    "
    exit $?
fi

737# }}}

739# {{{ Resolve the effective embedding model and record this run's overrides

740# Why this exists: run.sh launches a fresh luajit process per stage, and argv/env

741# reach only the stages we remember to thread them through. Before this block, a

742# --model override silently reverted to config.lua's default in the HTML,

743# word-cloud and word-page stages (they resolve the model via get_selected_model()

744# / embeddings_dir() with no argument). The fix is a shared notepad in RAM: we

745# stamp THIS run's choices onto tmp/run-overrides.lua once, here, and the model

746# resolver reads them. It is rewritten every run, so a previous run's --model can

747# never leak in -- the staleness trap a file has but an env var does not. Passing

748# an empty CLI_MODEL records no model key, so a plain run falls back to config.lua.

# Make sure the tmpfs-backed tmp/ symlink AND its target exist before anything
# is written into it. A bare `mkdir -p tmp` does NOT work here: tmp/ is a
# symlink, and if its target is missing (wiped on reboot) mkdir sees the link,
# reports "exists", and creates nothing. ensure-tmp-symlink is the project's
# idempotent, fail-loud helper for exactly this -- it creates the /tmp target
# the symlink points at.
if ! "$DIR/scripts/ensure-tmp-symlink" "$DIR"; then
    echo "Error: could not materialize the tmp/ RAM directory (scripts/ensure-tmp-symlink)" >&2
    exit 1
fi
# Record this run's overrides. CLI_MODEL is passed even when empty.
if ! "$DIR/scripts/write-run-overrides" "$DIR" --model "$CLI_MODEL"; then
    echo "Error: failed to record run overrides (scripts/write-run-overrides)" >&2
    exit 1
fi

# The effective model for run.sh's OWN path/freshness checks and for the stages
# it hands an explicit model to. The CLI value wins; with no --model, ask the
# inference-server-config module (honouring --server when given), so the value
# comes from the same resolver the child stages use.
MODEL_NAME="$CLI_MODEL"
if [ -z "$MODEL_NAME" ]; then
    MODEL_NAME="$(luajit -e "
        package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
        local inf = require('inference-server-config')
        inf.set_project_root('$DIR')
        if '$INFERENCE_SERVER' ~= '' then inf.set_selected_server('$INFERENCE_SERVER') end
        io.write(inf.get_selected_model())
    ")"
    if [ -z "$MODEL_NAME" ]; then
        echo "Error: could not resolve embedding model from config.lua" >&2
        exit 1
    fi
fi

# Create this model's cache directories ONCE, here, rather than having each
# stage mkdir its own output dir before its first write. scripts/cache-dir maps
# the model name to its directories: the movable (RAM) dir, whose similarities/
# subdir is created too, and the reboot-surviving on-disk dir (--disk). A
# brand-new model otherwise has no assets/embeddings/<model>/ folder, which once
# let a 40-minute diversity run finish and then fail at its final write.
# Selecting a new model is now enough; no manual mkdir is needed.
_ram_dir="$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME")"
_disk_dir="$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME" --disk)"
if [[ -z "$_ram_dir" || -z "$_disk_dir" ]]; then
    echo "Error: could not resolve cache directories for model $MODEL_NAME" >&2
    exit 1
fi
if ! mkdir -p "$_ram_dir/similarities" "$_disk_dir"; then
    echo "Error: could not create cache directories for model $MODEL_NAME" >&2
    exit 1
fi

801# }}}

803# {{{ Logging functions

# Print $1 on stdout unless --quiet was given.
log_info() {
    $QUIET || echo "$1"
}

809

# Print $1 on stdout only when --verbose was given.
log_verbose() {
    ! $VERBOSE || echo "$1"
}

815

# Print a stage banner unless --quiet was given: a blank line, then the title
# ($1, in green) between two magenta rules.
log_stage() {
    $QUIET && return 0
    local rule="${COLOR_MAGENTA}═══════════════════════════════════════════════════════════════════${COLOR_RESET}"
    echo ""
    echo -e "$rule"
    echo -e " ${COLOR_GREEN}$1${COLOR_RESET}"
    echo -e "$rule"
}

824

# Announce a command ($1) that --dry-run is skipping. Always printed; this
# function does not consult --quiet.
log_dry_run() {
    printf '%s\n' "[DRY-RUN] Would execute: $1"
}

828

# ANSI color codes for terminal output
# These add visual distinction to success/info/error messages
COLOR_GREEN="\033[92m"    # Bright green for success (✓, ✅)
COLOR_BLUE="\033[94m"     # Bright blue for info (ℹ️)
COLOR_RED="\033[91m"      # Bright red for errors (✗, ❌)
COLOR_YELLOW="\033[93m"   # Bright yellow for warnings (⚠️)
COLOR_MAGENTA="\033[95m"  # Bright magenta for stage delimiters
COLOR_RESET="\033[0m"     # Reset to default

# Print $2 wrapped in the color code $1 and followed by a color reset.
# Escape sequences are interpreted (echo -e).
_emit_colored() {
    echo -e "${1}${2}${COLOR_RESET}"
}

# Colored symbol helpers: each prints its argument in one fixed color.
symbol_success() { _emit_colored "$COLOR_GREEN" "$1"; }
symbol_info()    { _emit_colored "$COLOR_BLUE" "$1"; }
symbol_error()   { _emit_colored "$COLOR_RED" "$1"; }
symbol_warning() { _emit_colored "$COLOR_YELLOW" "$1"; }

854# }}}

856# {{{ Issue 10-058: resolve + record the build's master seed

857# A single integer governs every randomization site this run (the word-cloud

858# shuffle and image-order randomization). Resolved here -- after arg parsing and

859# DIR setup, before any stage -- with this precedence (highest first):

860# 1. --seed N on the command line

861# 2. config.randomization.seed in config.lua

862# 3. an auto-generated seed (so an unseeded build is still reproducible after

863# the fact, because we RECORD whatever we resolve)

864# The resolved seed is logged, written to output/generation-metadata.json, and

865# threaded to each randomizing subprocess as "--seed=N".

866

# {{{ resolve_random_seed()
# Set RANDOM_SEED and RANDOM_SEED_SOURCE from the first source that has a
# value: --seed on the command line, then config.randomization.seed, then an
# auto-generated seed. The value is not validated here.
resolve_random_seed() {
    if [ -n "$RANDOM_SEED_FLAG" ]; then
        RANDOM_SEED="$RANDOM_SEED_FLAG"
        RANDOM_SEED_SOURCE="cli (--seed)"
        return
    fi
    # config.lua is a static `return {...}` table, so dofile reads it without the
    # Lua config-loader. pcall guards a malformed config (empty => fall through).
    local cfg_seed
    cfg_seed=$(luajit -e 'local ok,c=pcall(dofile,"'"$DIR"'/config.lua"); if ok and type(c)=="table" and c.randomization and c.randomization.seed then io.write(tostring(c.randomization.seed)) end')
    if [ -n "$cfg_seed" ]; then
        RANDOM_SEED="$cfg_seed"
        RANDOM_SEED_SOURCE="config.lua (randomization.seed)"
        return
    fi
    # Auto: mix epoch seconds with the PID so two runs in the same second differ;
    # fold to a 31-bit non-negative int so it round-trips through CLI/JSON/randomseed.
    RANDOM_SEED=$(( ($(date +%s) * 100000 + $$) % 2147483647 ))
    RANDOM_SEED_SOURCE="auto-generated"
}
# }}}

# validate_random_seed: reject a resolved seed that is not a run of decimal
# digits, then strip leading zeros ("007" -> 7, "000" -> 0).
#
# No fallback on a bad value: a malformed seed is a hard error, because silently
# substituting a random one would defeat the reproducibility this whole feature buys.
# Leading zeros are stripped because the seed is written as a bare number into
# generation-metadata.json, and JSON does not allow leading zeros in numbers.
validate_random_seed() {
    case "$RANDOM_SEED" in
        ''|*[!0-9]*)
            echo "ERROR: resolved random seed '$RANDOM_SEED' is not a non-negative integer." >&2
            echo " Fix --seed or config.randomization.seed and re-run." >&2
            exit 1
            ;;
    esac
    # Pure string handling (no arithmetic), so a long digit string cannot overflow.
    RANDOM_SEED="${RANDOM_SEED#"${RANDOM_SEED%%[!0]*}"}"
    RANDOM_SEED="${RANDOM_SEED:-0}"
}

resolve_random_seed
validate_random_seed

# The argument every randomizing subprocess receives. Equals-form on purpose: the
# bare number can never be mistaken for a positional DIR by a child's arg parser.
RANDOM_SEED_ARG="--seed=$RANDOM_SEED"
log_info "🎲 Random seed: $RANDOM_SEED (source: $RANDOM_SEED_SOURCE)"

906

907# {{{ write_generation_metadata()

# The canonical "which seed made this build?" record: a small JSON file at the
# output root holding the seed, where it came from, a UTC timestamp and the
# pagination settings. Written early (so an interrupted build still leaves it)
# and at the root (so per-stage clears, which only touch output/ subdirs,
# never wipe it). The output root is OUTPUT_DIR, or $DIR/output when that is
# empty; it is created if missing.
write_generation_metadata() {
    local target_dir="${OUTPUT_DIR:-$DIR/output}"
    local stamp
    mkdir -p "$target_dir"
    stamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    printf '%s\n' \
        "{" \
        "  \"seed\": $RANDOM_SEED," \
        "  \"seed_source\": \"$RANDOM_SEED_SOURCE\"," \
        "  \"generated_at\": \"$stamp\"," \
        "  \"pages\": \"${PAGES:-default}\"," \
        "  \"poems_per_page\": \"${POEMS_PER_PAGE:-default}\"" \
        "}" \
        > "$target_dir/generation-metadata.json"
}

926# }}}

927

# Record the seed now, or under --dry-run only say where it would be written.
# The dry-run message uses the same location rule as write_generation_metadata
# (OUTPUT_DIR, else $DIR/output), so it stays accurate when --output is given.
if $DRY_RUN; then
    log_dry_run "write ${OUTPUT_DIR:-$DIR/output}/generation-metadata.json (seed $RANDOM_SEED)"
else
    write_generation_metadata
fi

933# }}}

935# {{{ Stage execution functions

936

937# {{{ run_update_words

# Stage 1: sync the input files by running scripts/update-words.
# --force is passed along when either the global --force or --force-stage 1
# was given (Issue 10-016; Issue 7-003: the flag skips file preservation).
# A failing sync only produces a warning; the pipeline continues.
run_update_words() {
    log_stage "📁 Stage 1/10: Updating input files from words repository"

    local -a update_args=()
    if $FORCE || $FORCE_STAGE_1; then
        update_args+=(--force)
    fi

    if $DRY_RUN; then
        log_dry_run "$DIR/scripts/update-words ${update_args[*]}"
        return 0
    fi

    if ! "$DIR/scripts/update-words" "${update_args[@]}"; then
        echo "Warning: Failed to update input files, continuing anyway..." >&2
    fi
}

960# }}}

961

962# {{{ run_extract

# Stage 2: extract content from the backup archives via scripts/update.
# Extraction failure is fatal (exit 1).
# Globals (read): DIR, DRY_RUN, INCLUDE_BOOSTS
run_extract() {
    log_stage "🔄 Stage 2/10: Extracting content from backup archives"

    # Issue 8-011: forward the boost-inclusion switch only when it is enabled.
    local boost_flag=""
    $INCLUDE_BOOSTS && boost_flag="--include-boosts"

    if $DRY_RUN; then
        log_dry_run "$DIR/scripts/update $DIR $boost_flag"
        return 0
    fi

    # $boost_flag stays unquoted so an empty value contributes no argument.
    if ! "$DIR/scripts/update" "$DIR" $boost_flag; then
        echo "Error: Content extraction failed" >&2
        exit 1
    fi
}

982# }}}

983

984# {{{ run_strip_excluded

# Issue 10-053: once sync/extraction has finished, strip the excluded images and
# note source files out of input/, so nothing downstream can catalog, embed,
# render, or upload them. This has to run ahead of image cataloging.
# strip-excluded validates every exclusion BEFORE it deletes anything and exits
# non-zero when an exclusion path points at no real file. That is treated as
# FATAL here: carrying on would ship content explicitly marked do-not-ship.
# Because validation precedes both the stripping and the expensive
# catalog/embed stages, a bad path costs only a cheap re-run.
# Globals (read): DIR, DRY_RUN
run_strip_excluded() {
    log_stage "🧹 Stripping excluded content from input/"

    if $DRY_RUN; then
        log_dry_run "lua $DIR/scripts/strip-excluded $DIR"
        return 0
    fi

    lua "$DIR/scripts/strip-excluded" "$DIR" || {
        echo "ERROR: strip-excluded failed -- a broken exclusion path in config.lua." >&2
        echo " Fix excluded_images and re-run; nothing was stripped or shipped." >&2
        exit 1
    }
}

1004# }}}

1005

1006# {{{ run_parse

# Stage 3: parse poems from the JSON sources (src/main.lua --parse-only).
# A parse failure is fatal (exit 1).
# Globals (read): DIR, DRY_RUN, FORCE, FORCE_STAGE_3, BOOSTS_ARG, ASSETS_ARG
run_parse() {
    log_stage "📝 Stage 3/10: Parsing poems from JSON sources"

    # Issue 10-016: the global force flag and the stage-3 force flag both
    # translate into --force for the parser.
    local force_arg=""
    if $FORCE_STAGE_3 || $FORCE; then
        force_arg="--force"
    fi

    if $DRY_RUN; then
        log_dry_run "luajit src/main.lua $DIR --parse-only $force_arg $BOOSTS_ARG $ASSETS_ARG"
        return 0
    fi

    # The optional arguments are unquoted so that empty ones disappear.
    if ! luajit src/main.lua "$DIR" --parse-only $force_arg $BOOSTS_ARG $ASSETS_ARG; then
        echo "Error: Poem parsing failed" >&2
        exit 1
    fi
}

1029# }}}

1030

1031# {{{ run_validate

# Stage 4: validate the parsed poem data (src/main.lua --validate-only).
# A validation failure is fatal (exit 1).
# Globals (read): DIR, DRY_RUN, ASSETS_ARG
run_validate() {
    local check_mark
    check_mark=$(symbol_success "✓")
    log_stage "$check_mark Stage 4/10: Validating poem data"

    if $DRY_RUN; then
        log_dry_run "luajit src/main.lua $DIR --validate-only $ASSETS_ARG"
        return 0
    fi

    # $ASSETS_ARG is unquoted so an empty value adds no argument.
    if ! luajit src/main.lua "$DIR" --validate-only $ASSETS_ARG; then
        echo "Error: Poem validation failed" >&2
        exit 1
    fi
}

1045# }}}

1046

1047# {{{ run_catalog_images

# Stage 5: catalog images (src/main.lua --catalog-only). A failure is fatal.
# Issue 10-015a: when VERBOSE is on, --verbose is forwarded so the cataloger
# prints its detailed image statistics.
# Globals (read): DIR, DRY_RUN, VERBOSE, ASSETS_ARG, RANDOM_SEED_ARG
run_catalog_images() {
    log_stage "🖼️ Stage 5/10: Cataloging images"

    local verbose_flag=""
    if $VERBOSE; then
        verbose_flag="--verbose"
    fi

    if $DRY_RUN; then
        log_dry_run "luajit src/main.lua $DIR --catalog-only $verbose_flag $ASSETS_ARG $RANDOM_SEED_ARG"
        return 0
    fi

    # Optional arguments are unquoted so empty ones contribute nothing.
    if ! luajit src/main.lua "$DIR" --catalog-only $verbose_flag $ASSETS_ARG $RANDOM_SEED_ARG; then
        echo "Error: Image cataloging failed" >&2
        exit 1
    fi
}

1066# }}}

1067

1068# {{{ emb_cache_dir

# Issue 10-054: resolve a model's cache directory through the shared resolver
# (scripts/cache-dir), so run.sh's freshness/pre-flight checks look in EXACTLY
# the place the Lua code and generate-embeddings.sh write -- disk or RAM, per
# the CACHE_IN_RAM switch. Pass --disk for the reboot-surviving diversity cache.
#
# Arguments: any extra flags are forwarded verbatim to scripts/cache-dir.
# Globals (read): DIR, MODEL_NAME
# Outputs:   the resolved directory on stdout.
# Exits:     1 when the resolver itself fails (non-zero status) OR prints
#            nothing. Either way the answer cannot be trusted, and a hard error
#            beats a silently-wrong path.
#
# NOTE: inside a command substitution, `exit` only ends that subshell. Callers
# that write "$(emb_cache_dir)" must check the substitution's status themselves
# if they want the failure to stop the run.
emb_cache_dir() {
    local resolved
    if ! resolved="$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME" "$@")" \
        || [ -z "$resolved" ]; then
        echo "Error: could not resolve cache dir (scripts/cache-dir)" >&2
        exit 1
    fi
    printf '%s\n' "$resolved"
}

1083# }}}

1084

1085# {{{ run_generate_embeddings

# Stage 6: make sure every poem has an embedding, by running
# generate-embeddings.sh in incremental mode (or --full-regen when forced).
# Skips the work only when the embedding count already covers every poem.
# Globals (read): DIR, MODEL_NAME, FORCE, FORCE_STAGE_6, DRY_RUN,
#                 INFERENCE_SERVER, NICE_PREFIX
# Exits: 1 when the cache directory cannot be resolved or generation fails.
run_generate_embeddings() {
    log_stage "🤖 Stage 6/10: Generating embeddings via the inference server"

    # Resolve the cache directory once, as a plain assignment, so a resolver
    # failure is seen here. Written as local x="$(emb_cache_dir)/..." the
    # status is replaced by local's own and emb_cache_dir's `exit` only ends
    # the substitution subshell -- the stage would carry on against
    # "/embeddings.json".
    local cache_dir
    cache_dir="$(emb_cache_dir)" || exit 1
    local embeddings_file="$cache_dir/embeddings.json"
    local poems_file="$DIR/assets/poems.json"

    # Issue 10-016: Check both global and per-stage force flags
    local stage_force=$FORCE
    $FORCE_STAGE_6 && stage_force=true

    # Freshness check (Issue 10-050): skip ONLY when every poem already has an
    # embedding. The old test compared mtimes (embeddings.json newer than
    # poems.json) -- which was wrong: a run that embedded 8160/8362 and then
    # died leaves a NEWER but INCOMPLETE embeddings.json, so mtime said
    # "fresh, skip" and the missing poems never got done. Counting entries is
    # the honest signal; incremental mode then fills only the gap, so it is
    # cheap to re-run.
    if ! $stage_force && [ -f "$embeddings_file" ] && [ -f "$poems_file" ]; then
        # Count embeddings WITHOUT parsing the (large) JSON: each entry carries
        # exactly one "poem_index" key. (This counts error records too, so it
        # can only over-report completeness; incremental retries those anyway.)
        local emb_count
        emb_count=$(grep -o '"poem_index"' "$embeddings_file" | wc -l)
        local poem_count
        poem_count=$(luajit -e "
            package.path = '$DIR/?.lua;' .. package.path
            local dk = require('libs/dkjson')
            local f = io.open('$poems_file'); local d = dk.decode(f:read('*a')); f:close()
            print(#(d.poems or d))
        ")
        if [ -n "$poem_count" ] && [ "$poem_count" -gt 0 ] && [ "$emb_count" -ge "$poem_count" ]; then
            log_info " ⏭️ Embeddings complete ($emb_count/$poem_count), skipping..."
            return 0
        fi
        log_info " Embeddings incomplete ($emb_count/${poem_count:-?}) — running incremental to fill the gap..."
    fi

    local force_arg=""
    if $stage_force; then
        force_arg="--full-regen"
    else
        force_arg="--incremental"
    fi

    # Issue 10-017: Build Inference server argument
    local server_arg=""
    if [ -n "$INFERENCE_SERVER" ]; then
        server_arg="--server=$INFERENCE_SERVER"
    fi

    if $DRY_RUN; then
        log_dry_run "$DIR/generate-embeddings.sh $force_arg --model=$MODEL_NAME $server_arg $DIR"
        log_dry_run "luajit $DIR/src/generate-word-pages.lua $DIR --embeddings-only"
        return 0
    fi

    if [ -n "$INFERENCE_SERVER" ]; then
        log_info " Inference Server: $INFERENCE_SERVER"
    fi
    log_info " Model: $MODEL_NAME"
    # Report the path this stage actually checks (disk or RAM cache), not a
    # fixed assets/ location.
    log_info " Output: $embeddings_file"
    # Report the mode that is really passed below: it follows stage_force, so
    # a per-stage force (FORCE_STAGE_6) is reported as a full regeneration too.
    log_info " Mode: $(if $stage_force; then echo 'full regeneration'; else echo 'incremental (skip existing)'; fi)"

    # Issue 10-028: Apply low priority to expensive embedding generation
    $NICE_PREFIX "$DIR/generate-embeddings.sh" $force_arg --model="$MODEL_NAME" $server_arg "$DIR" || {
        echo "Error: Embedding generation failed" >&2
        echo "Make sure the inference server is running with the $MODEL_NAME model" >&2
        exit 1
    }

    # Word embeddings used to run here, but the word-COLOR step inside
    # generate-word-pages needs color_embeddings.json, which is produced later
    # by run_generate_semantic_colors. Running words first made that step skip
    # with "no color embeddings found". Moved to run_generate_word_embeddings,
    # called AFTER colors in main.
}

1163# }}}

1164

1165# {{{ run_generate_word_embeddings

# Word-cloud word embeddings + their semantic colors. Split out of
# run_generate_embeddings (Issue 8-043b) and ordered AFTER the semantic-color
# stage so color_embeddings.json already exists when the word-color step runs.
# A failure is a warning only; the function still returns success.
# Honors DRY_RUN like the other stage functions: in dry-run mode the command is
# only previewed, never executed.
# Globals (read): DIR, DRY_RUN, WORDCLOUD_WORDS, NICE_PREFIX
run_generate_word_embeddings() {
    log_info " Generating word embeddings for word cloud..."

    # WORDCLOUD_WORDS carries either a number or the literal "all"; the
    # generator accepts both via --words (it treats "--words all" the same as
    # "--all").
    local wordcloud_args=""
    if [ -n "$WORDCLOUD_WORDS" ]; then
        wordcloud_args="--words $WORDCLOUD_WORDS"
    fi

    if $DRY_RUN; then
        log_dry_run "luajit $DIR/src/generate-word-pages.lua $DIR --embeddings-only $wordcloud_args"
        return 0
    fi

    # $wordcloud_args is unquoted on purpose: it is either empty or the two
    # words "--words N".
    $NICE_PREFIX luajit "$DIR/src/generate-word-pages.lua" "$DIR" --embeddings-only $wordcloud_args || {
        echo "Warning: Word embedding generation failed, continuing..." >&2
    }
}

1181# }}}

1182

1183# {{{ run_generate_semantic_colors

# Stage 6.5 / 6b: keep color_embeddings.json and poem_colors.json up to date.
#   1. (Re)generate color_embeddings.json when it is missing or the color
#      palette in config.lua has changed since the last build.
#   2. (Re)generate poem_colors.json when forced, missing, or older than
#      either of its inputs.
# Runs BEFORE similarity matrix generation.
# Requires: embeddings.json (returns 0 quietly when it does not exist yet).
# Respects: --force / stage-6 force (skip freshness check), --dry-run.
# Globals (read): DIR, MODEL_NAME, FORCE, FORCE_STAGE_6, DRY_RUN, INTERACTIVE,
#                 INFERENCE_SERVER
# Exits: 1 when the cache directory cannot be resolved or a generation fails.
run_generate_semantic_colors() {
    # Resolve the cache directory ONCE, as a plain assignment. Every path below
    # lives in it, and written as local x="$(emb_cache_dir)/..." a resolver
    # failure is hidden (local's status wins, and emb_cache_dir's `exit` only
    # ends the substitution subshell), leaving paths like "/embeddings.json".
    local cache_dir
    cache_dir="$(emb_cache_dir)" || exit 1

    local embeddings_file="$cache_dir/embeddings.json"
    local poem_colors_file="$cache_dir/poem_colors.json"
    local color_embeddings_file="$cache_dir/color_embeddings.json"

    # Embeddings must exist first (exit early if not - prevents confusing errors)
    if [ ! -f "$embeddings_file" ]; then
        log_verbose " Skipping semantic colors - embeddings not yet generated"
        return 0
    fi

    # color_embeddings.json is derived from the color palette (color_names +
    # color_associations in config.lua). It used to regenerate ONLY when the
    # file was missing, so editing the palette -- e.g. dropping gray as a
    # cluster color -- had no effect until someone deleted the cache by hand.
    # We now fingerprint the palette and regenerate whenever it changes, so
    # editing colors then re-running actually takes effect. The fingerprint is
    # a sorted, deterministic dump of the palette -- no server needed to
    # compute it.
    local palette_fp_file="$cache_dir/color_palette.fingerprint"
    local current_palette_fp
    current_palette_fp=$(luajit -e "
        package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
        local config = require('config-loader').load()
        local names = {}
        for _, n in ipairs(config.color_names or {}) do names[#names+1] = n end
        table.sort(names)
        local parts = {}
        for _, n in ipairs(names) do
            local a = {}
            for _, w in ipairs((config.color_associations or {})[n] or {}) do a[#a+1] = w end
            table.sort(a)
            parts[#parts+1] = n .. '=' .. table.concat(a, ',')
        end
        io.write(table.concat(parts, '|'))
    ")
    local stored_palette_fp=""
    [ -f "$palette_fp_file" ] && stored_palette_fp=$(cat "$palette_fp_file")

    # Regenerate color embeddings if missing OR the palette changed since last time.
    if [ ! -f "$color_embeddings_file" ] || [ "$current_palette_fp" != "$stored_palette_fp" ]; then
        # Pick the stage header and the matching info line together, so the
        # info line never claims the file is missing when only the palette
        # changed.
        local regen_note
        if [ -f "$color_embeddings_file" ]; then
            log_stage "🎨 Stage 6.5/10: Color palette changed -- regenerating color embeddings"
            regen_note="Color palette changed, regenerating color embeddings via the inference server..."
        else
            log_stage "🎨 Stage 6.5/10: Generating color embeddings (one-time)"
            regen_note="Color embeddings not found, generating via the inference server..."
        fi

        if $DRY_RUN; then
            log_dry_run "luajit semantic-color-calculator (generate color embeddings)"
            # No return here: the poem-colors section below still has to make
            # (and preview) its own decision in dry-run mode.
        else
            log_info " $(symbol_warning "⚠️") $regen_note"
            # Issue 10-003 migrated color_names from config/semantic-colors.json (now deleted)
            # into config.lua, loaded via libs/config-loader.lua. Errors here are loud rather
            # than silent so a missing config doesn't propagate downstream as a confusing
            # "Failed to load required data files" in the next stage.
            luajit -e "
                package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
                local calc = require('semantic-color-calculator')
                local utils = require('utils')
                utils.init_assets_root({'$DIR'})

                -- Mirror the --server selection pattern used elsewhere in run.sh
                -- (see the interactive TUI block below). If INFERENCE_SERVER is empty
                -- the module falls back to config.lua's default_inference_server.
                -- The interactive flag is forwarded so that a typoed --server or
                -- --model triggers a 1/2 prompt only when the operator launched
                -- run.sh with -I; otherwise we hard-error.
                local inference = require('inference-server-config')
                inference.set_project_root('$DIR')
                inference.set_interactive_mode('$INTERACTIVE' == 'true')
                if '$INFERENCE_SERVER' ~= '' then
                    inference.set_selected_server('$INFERENCE_SERVER')
                end

                local config = require('config-loader').load()
                if not config.color_names then
                    error('config.lua is missing color_names (Issue 10-003 migration)')
                end
                -- Pass color_associations so each color's embedding is the mean
                -- of its essence words, not the bare color word (richer + the
                -- z-scored assignment is balanced). nil endpoint = use the
                -- selected server. Falls back to bare words if associations absent.
                local embeddings = calc.generate_color_embeddings(config.color_names, '$MODEL_NAME', nil, config.color_associations)
                if not next(embeddings) then
                    error('Inference server returned no color embeddings')
                end
                local data = {embeddings = embeddings, generated_at = os.date('%Y-%m-%d %H:%M:%S'), model_name = '$MODEL_NAME'}
                utils.write_json_file('$color_embeddings_file', data)
                print('[INFO] Color embeddings saved: ' .. '$color_embeddings_file')
            " || {
                echo "Error: Color embedding generation failed" >&2
                exit 1
            }
            # Remember the palette we just built from, so the next run can tell
            # whether it changed (and skip this server round-trip when it hasn't).
            printf '%s\n' "$current_palette_fp" > "$palette_fp_file"
        fi
    fi

    # Issue 10-016: Check both global and per-stage force flags (Stage 6)
    local stage_force=$FORCE
    $FORCE_STAGE_6 && stage_force=true

    # Check freshness: poem_colors.json should be newer than its inputs.
    # With --force or --force-stage 6: always regenerate regardless of freshness
    if ! $stage_force && [ -f "$poem_colors_file" ] && [ -f "$embeddings_file" ]; then
        # Poem colors depend on BOTH the poem embeddings AND the color centroids, so
        # they are only fresh when newer than both. Watching only embeddings.json
        # meant a palette change (which rewrites color_embeddings.json but not
        # embeddings.json) left poem_colors.json stale yet considered "fresh".
        if [ "$poem_colors_file" -nt "$embeddings_file" ] && [ "$poem_colors_file" -nt "$color_embeddings_file" ]; then
            log_info " ⏭️ Semantic colors are fresh (newer than embeddings + palette), skipping..."
            return 0
        fi
        log_verbose " poem_colors.json is stale (older than embeddings or palette), regenerating..."
    elif $stage_force; then
        log_verbose " --force specified, regenerating semantic colors..."
    fi

    log_stage "🎨 Stage 6b/10: Computing semantic colors (part of embeddings)"

    if $DRY_RUN; then
        log_dry_run "luajit semantic-color-calculator (poem colors regeneration)"
        return 0
    fi

    log_info " Input: $embeddings_file"
    log_info " Output: $poem_colors_file"

    # Regenerate poem colors using existing embeddings
    luajit -e "
        package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
        local calc = require('semantic-color-calculator')
        local utils = require('utils')
        utils.init_assets_root({'$DIR'})

        local poems_data = utils.read_json_file(utils.asset_path('poems.json'))
        local embeddings_data = utils.read_json_file('$embeddings_file')
        local color_embeddings_data = utils.read_json_file('$color_embeddings_file')

        if poems_data and embeddings_data and color_embeddings_data then
            calc.precompute_poem_colors(poems_data, embeddings_data, color_embeddings_data.embeddings, '$poem_colors_file')
        else
            error('Failed to load required data files')
        end
    " || {
        echo "Error: Semantic color generation failed" >&2
        exit 1
    }
}

1345# }}}

1346

1347# {{{ run_augment_images

# Issue 9-013: give every text-less image a pseudo-embedding (the normalized
# average of the poem before and after it chronologically) and fold those into
# embeddings.json so the GPU similarity stage ranks images alongside poems.
# Also writes image-manifest.json, which the HTML renderer reads to draw image
# entries. Cheap and idempotent, so it runs each time before the matrix build.
# Globals (read): DIR, DRY_RUN, NICE_PREFIX
# Exits: 1 when the cache directory cannot be resolved, embeddings.json is
#        missing, or the augmentation script fails.
run_augment_images() {
    log_stage "🖼️ Stage 6.7: Folding images into the embedding set (pseudo-embeddings)"

    # Plain assignment (not local x="$(...)") so a resolver failure stops the
    # stage with the resolver's own message, instead of surfacing below as a
    # misleading "embeddings.json not found" for the path "/embeddings.json".
    local cache_dir
    cache_dir="$(emb_cache_dir)" || exit 1
    local embeddings_file="$cache_dir/embeddings.json"

    if [ ! -f "$embeddings_file" ]; then
        echo "Error: embeddings.json not found; run --generate-embeddings first" >&2
        exit 1
    fi
    if $DRY_RUN; then
        log_dry_run "luajit $DIR/src/augment-embeddings-with-images.lua $DIR"
        return 0
    fi
    $NICE_PREFIX luajit "$DIR/src/augment-embeddings-with-images.lua" "$DIR" || {
        echo "Error: image augmentation failed" >&2
        exit 1
    }
}

1370# }}}

1371

1372# {{{ run_generate_similarity

# Stage 7: build the per-poem similarity files on the GPU.
# Skips the work when enough similarity files exist and none of them is older
# than embeddings.json (unless forced).
# Globals (read): DIR, MODEL_NAME, FORCE, FORCE_STAGE_7, DRY_RUN, THREADS,
#                 PAGES, POEMS_PER_PAGE,
#                 SIMILARITY_MIN_FILES (optional; minimum number of
#                 poem_*.json files for the set to count as complete,
#                 default 7797 -- the previously hard-coded value)
# Exits: 1 when the GPU library or embeddings are missing, the cache directory
#        cannot be resolved, or generation fails.
run_generate_similarity() {
    # GPU (Vulkan) is required: these are O(N^2) similarity calculations that make no
    # sense on a CPU, so the CPU route was removed (Issue 10-057). A missing GPU library
    # is a hard error with build instructions, never a slow fallback.
    if [ ! -f "$DIR/libs/vulkan-compute/build/libvkcompute.so" ]; then
        echo "Error: GPU library not found: libs/vulkan-compute/build/libvkcompute.so" >&2
        echo "Build it: cd libs/vulkan-compute && make" >&2
        exit 1
    fi
    log_stage "📊 Stage 7/10: Building similarity matrix with GPU (~5-10 min)"

    # Resolve the cache directory once, as a plain assignment, so a resolver
    # failure stops the stage here. (local x="$(emb_cache_dir)/..." hides the
    # failure behind local's own status.) The same resolved path is reused for
    # the freshness check, the logs, and the Lua call below.
    local cache_dir
    cache_dir="$(emb_cache_dir)" || exit 1
    local embeddings_file="$cache_dir/embeddings.json"

    # Check if embeddings exist
    if [ ! -f "$embeddings_file" ]; then
        echo "Error: Embeddings file not found: $embeddings_file" >&2
        echo "Run --generate-embeddings first" >&2
        exit 1
    fi

    # Issue 10-016: Check both global and per-stage force flags (Stage 7)
    local stage_force=$FORCE
    $FORCE_STAGE_7 && stage_force=true

    # Issue 8-033: Check for individual similarity files instead of monolithic matrix
    local similarities_dir="$cache_dir/similarities"
    local similarity_count=0
    if [ -d "$similarities_dir" ]; then
        similarity_count=$(find "$similarities_dir" -name "poem_*.json" 2>/dev/null | wc -l)
    fi

    # Freshness check: skip only if the set is complete AND no file in it is
    # stale. Looking at the newest file alone is not enough: a regeneration
    # interrupted part-way leaves some files new and the rest old, and the
    # newest one would vouch for all of them. So look for ANY file that is not
    # newer than embeddings.json instead.
    local min_files="${SIMILARITY_MIN_FILES:-7797}"
    if ! $stage_force && [ "$similarity_count" -gt 0 ] && [ "$similarity_count" -ge "$min_files" ]; then
        local stale_file
        stale_file=$(find "$similarities_dir" -name "poem_*.json" ! -newer "$embeddings_file" -print -quit 2>/dev/null)
        if [ -z "$stale_file" ]; then
            log_info " ⏭️ Similarity files are fresh ($similarity_count files newer than embeddings), skipping..."
            return 0
        fi
    fi

    local threads_arg=""
    if [ -n "$THREADS" ]; then
        threads_arg="--threads=$THREADS"
    fi

    if $DRY_RUN; then
        log_dry_run "luajit (GPU vk_similarity via libvkcompute.so) --generate-matrix $threads_arg"
        return 0
    fi

    # Log the resolved locations (disk or RAM cache), i.e. the same paths the
    # freshness check above inspected.
    log_info " Input: $embeddings_file"
    log_info " Output: $similarities_dir/*.json (individual files)"

    # Issue 10-016: Convert stage_force to Lua boolean for Lua function calls
    local stage_force_lua="false"
    $stage_force && stage_force_lua="true"

    # GPU similarity generation using Vulkan compute shaders (the only route now)
    log_info " Mode: GPU-accelerated (Vulkan)"

    # Pass threads value to GPU similarity (defaults to 8 if not specified)
    local default_threads=8
    local threads_to_use=${THREADS:-$default_threads}
    log_info " CPU sorting threads: $threads_to_use"

    DIR="$DIR" luajit -e "
        package.path = '$DIR/?.lua;$DIR/?/init.lua;$DIR/libs/?.lua;' .. package.path
        local vk_sim = require('libs.vulkan-compute.lua.vk_similarity')
        -- Issue 10-057: size the rankings cache to exactly what THIS build shows
        -- per poem -- the ACTUAL pages it generates (the --pages value, else the
        -- config default minimum_pages -- NOT the storage ceiling
        -- max_pages_per_poem) times the poems shown per page. Everything is read
        -- at runtime from the run's flags + config; no hardcoded page counts. The
        -- HTML stage's loader regenerates if a later run ever needs more (the
        -- top_k stamp makes that detectable). The list is sorted nearest-first, so
        -- the top-K ARE precisely what the pages display.
        local _cfg = require('config-loader'); _cfg.set_project_root('$DIR')
        local _pag = _cfg.load().pagination
        if not _pag then error('config.pagination missing; cannot size the rankings cache') end
        local _pages = tonumber('$PAGES') or _pag.minimum_pages
        local _per_page = tonumber('$POEMS_PER_PAGE') or _pag.poems_per_page
        if not _pages or not _per_page then
            error('cannot resolve pages/poems_per_page to size the rankings cache')
        end
        local _top_k = _pages * _per_page
        -- Use TRUE parallel GPU computation (Issue 9-002 original design)
        local success = vk_sim.generate_similarity_matrix_gpu_parallel(
            '$embeddings_file',
            '$MODEL_NAME',
            $stage_force_lua,
            $threads_to_use,
            _top_k
        )
        if not success then
            print('[GPU SIMILARITY ERROR] GPU generation failed')
            os.exit(1)
        end
    " || {
        echo "Error: GPU similarity generation failed" >&2
        exit 1
    }

    # Note: Pre-sorted similarity rankings cache is now generated automatically
    # by the GPU similarity engine (in-RAM, no file re-reading needed)
}

1481# }}}

1482

1483# {{{ run_generate_diversity

# Stage 8: pre-compute the diversity cache on the GPU via
# scripts/precompute-diversity-sequences-gpu. Skipped when the cache file is
# newer than embeddings.json (unless forced).
# Globals (read): DIR, FORCE, FORCE_STAGE_8, DRY_RUN, PAGES, POEMS_PER_PAGE,
#                 NICE_PREFIX
# Exits: 1 when the GPU library or embeddings are missing, a cache directory
#        cannot be resolved, or generation fails.
run_generate_diversity() {
    # GPU (Vulkan) is required: the diversity walk is O(N^2) GPU work, so the CPU route
    # was removed (Issue 10-057). A missing GPU library is a hard error, not a fallback.
    if [ ! -f "$DIR/libs/vulkan-compute/build/libvkcompute.so" ]; then
        echo "Error: GPU library not found: libs/vulkan-compute/build/libvkcompute.so" >&2
        echo "Build it: cd libs/vulkan-compute && make" >&2
        exit 1
    fi
    log_stage "🎲 Stage 8/10: Pre-computing diversity cache with GPU (~1 min)"

    # Resolve both locations with plain assignments so a resolver failure stops
    # the stage (local x="$(...)" would hide it behind local's own status).
    # The diversity cache uses the --disk location; embeddings use the default
    # one.
    local disk_cache_dir cache_dir
    disk_cache_dir="$(emb_cache_dir --disk)" || exit 1
    cache_dir="$(emb_cache_dir)" || exit 1
    local cache_file="$disk_cache_dir/diversity_cache.json"
    local embeddings_file="$cache_dir/embeddings.json"

    # Check if embeddings exist
    if [ ! -f "$embeddings_file" ]; then
        echo "Error: Embeddings file not found: $embeddings_file" >&2
        echo "Run --generate-embeddings first" >&2
        exit 1
    fi

    # Issue 10-016: Check both global and per-stage force flags (Stage 8)
    local stage_force=$FORCE
    $FORCE_STAGE_8 && stage_force=true

    # Freshness check: skip if cache newer than embeddings
    if ! $stage_force && [ -f "$cache_file" ]; then
        if [ "$cache_file" -nt "$embeddings_file" ]; then
            log_info " ⏭️ Diversity cache is fresh (newer than embeddings), skipping..."
            return 0
        fi
    fi

    # Log the resolved paths -- the ones the freshness check above compared --
    # rather than a fixed assets/ location.
    log_info " Input: $embeddings_file"
    log_info " Output: $cache_file"

    # GPU diversity generation using Vulkan compute shaders (the only route now)
    log_info " Mode: GPU-accelerated (Vulkan)"

    if $DRY_RUN; then
        log_dry_run "$DIR/scripts/precompute-diversity-sequences-gpu $DIR"
        return 0
    fi

    # Issue 10-028: Apply low priority to expensive diversity generation.
    # The model is no longer passed via env here: the wrapper resolves it
    # through inference-server-config, which reads this run's overrides notepad
    # (tmp/run-overrides.lua, written above from --model) and falls back to
    # config.lua -- so the CLI override is honored without a per-stage env var.
    # Issue 10-057: pass the run's page settings so the wrapper caps each diversity
    # sequence to the SAME K the similarity cache and the HTML stage use.
    PAGES="$PAGES" POEMS_PER_PAGE="$POEMS_PER_PAGE" $NICE_PREFIX "$DIR/scripts/precompute-diversity-sequences-gpu" "$DIR" || {
        echo "Error: GPU diversity cache generation failed" >&2
        exit 1
    }
}

1541# }}}

1542

1543# {{{ run_generate_html

# Stage 9: generate the website HTML.
#   1. When forced, clear the stale similar/different/chronological pages.
#   2. Restore the authored page templates into input/pages/ (fatal on failure).
#   3. Run src/main.lua --html-only (fatal on failure).
#   4. Build the image gallery and the source browser (warnings on failure).
# In dry-run mode nothing is touched: every step, including the forced clear,
# is only previewed.
# Globals (read): DIR, FORCE, FORCE_STAGE_9, DRY_RUN, THREADS, PAGES,
#                 POEMS_PER_PAGE, CHRONO_PER_PAGE, ASSETS_ARG, NICE_PREFIX
run_generate_html() {
    log_stage "🌐 Stage 9/10: Generating website HTML"

    # Issue 10-016: Check both global and per-stage force flags (Stage 9)
    local stage_force=$FORCE
    $FORCE_STAGE_9 && stage_force=true

    local force_arg=""
    if $stage_force; then
        force_arg="--force"
    fi

    local threads_arg=""
    if [ -n "$THREADS" ]; then
        threads_arg="--threads $THREADS"
    fi

    # Issue 8-022: Pagination arguments
    local pages_arg=""
    if [ -n "$PAGES" ]; then
        pages_arg="--pages $PAGES"
    fi

    local poems_per_page_arg=""
    if [ -n "$POEMS_PER_PAGE" ]; then
        poems_per_page_arg="--poems-per-page $POEMS_PER_PAGE"
    fi

    local chrono_per_page_arg=""
    if [ -n "$CHRONO_PER_PAGE" ]; then
        chrono_per_page_arg="--chrono-per-page $CHRONO_PER_PAGE"
    fi

    # The dry-run exit comes BEFORE any deletion: a preview must never remove
    # existing output, even when combined with --force.
    if $DRY_RUN; then
        if $stage_force; then
            log_dry_run "rm -f $DIR/output/{similar,different,chronological}/*.html (clear stale HTML, --force)"
        fi
        log_dry_run "$DIR/scripts/sync-page-templates $DIR (restore explore-page copy into input/pages/)"
        log_dry_run "luajit src/main.lua $DIR --html-only $force_arg $threads_arg $pages_arg $poems_per_page_arg $chrono_per_page_arg $ASSETS_ARG"
        log_dry_run "luajit $DIR/src/generate-gallery-pages.lua $DIR"
        log_dry_run "luajit $DIR/src/generate-source-browser.lua $DIR"
        return 0
    fi

    # Issue 10-024: Clear output directories when forcing regeneration
    # This prevents stale files with obsolete poem_index values from persisting
    # after poem re-extraction changes the poem_index assignments
    if $stage_force; then
        log_info " Clearing stale HTML files (--force)..."
        rm -f "$DIR/output/similar/"*.html 2>/dev/null
        rm -f "$DIR/output/different/"*.html 2>/dev/null
        rm -f "$DIR/output/chronological/"*.html 2>/dev/null
    fi

    # Issue 11-005: restore the authored explore-page copy into the ephemeral
    # input/pages/ before generating. The canonical, version-controlled source is
    # page-templates/*.txt; input/ is wiped + re-synced from external sources each
    # run and does NOT carry this prose, so it is copied back in here. (Edit the
    # files in page-templates/ -- input/pages/ is overwritten from them.)
    "$DIR/scripts/sync-page-templates" "$DIR" || {
        echo "Error: failed to restore page templates into input/pages/" >&2
        exit 1
    }

    # Issue 10-028: Apply low priority to HTML generation (parallel processing)
    $NICE_PREFIX luajit src/main.lua "$DIR" --html-only $force_arg $threads_arg $pages_arg $poems_per_page_arg $chrono_per_page_arg $ASSETS_ARG || {
        echo "Error: HTML generation failed" >&2
        exit 1
    }

    # Issue 10-059: the word-cloud menu and per-word similarity pages moved to their
    # own stage 10 (run_generate_wordcloud). They run after this stage, so the
    # chronological pages main.lua just built are already present for their #poem links.

    # Issue 10-042: Build the image gallery (masonry pages per source + index +
    # chronological). It was previously a separate manual step, so the gallery
    # went stale -- it now regenerates with every HTML run from image-catalog.json.
    log_info " Generating image gallery..."
    $NICE_PREFIX luajit "$DIR/src/generate-gallery-pages.lua" "$DIR" || {
        echo "Warning: Gallery generation failed, continuing..." >&2
    }

    # Issue 10-052: Build the link-only source browser (code/issues/docs as HTML)
    # under output/source/. This is the "git push that builds a webpage" -- the
    # private monorepo never leaves the machine; whoever has the site link can
    # browse the source. It publishes an ALLOWLIST only (never the private input
    # corpus), so it is safe to ship with the rest of the site.
    log_info " Generating source browser..."
    $NICE_PREFIX luajit "$DIR/src/generate-source-browser.lua" "$DIR" || {
        echo "Warning: Source browser generation failed, continuing..." >&2
    }
    # NOTE: the downloadable zip is built at POST time by running
    # scripts/build-download-zip directly, not here -- it is a deploy artifact, and
    # there is no point regenerating a multi-GB archive on every local build. (The
    # site's links are document-relative, so there is no URL-conversion step before
    # upload; just upload output/ and build the zip.)
}

1638# }}}

1639

1640# {{{ run_generate_wordcloud

# Issue 10-059: the word-cloud stage. Builds the site's entry menu (which carries the
# live poem index) and the per-word similarity pages. Runs after stage 9, so the
# chronological pages its #poem links target already exist. Replaces the retired
# numeric-similarity-index stage, whose output (numeric-index.html) was linked from
# nowhere and was superseded by the menu's embedded poem index.
# In dry-run mode nothing is deleted or generated; the steps are only previewed.
# Globals (read): DIR, DRY_RUN, WORDCLOUD_WORDS, WORDCLOUD_POEMS,
#                 CHRONO_PER_PAGE, RANDOM_SEED_ARG, NICE_PREFIX
# Exits: 1 when either generator fails.
run_generate_wordcloud() {
    log_stage "🔤 Stage 10/10: Generating word-cloud menu and per-word pages"

    # Word-cloud arguments. WORDCLOUD_WORDS is a number or "all"; --words carries
    # either ("--words all" == every word, per the generators).
    local wordcloud_words_arg=""
    if [ -n "$WORDCLOUD_WORDS" ]; then
        wordcloud_words_arg="--words $WORDCLOUD_WORDS"
    fi

    # Issue 8-050d: Poems per word-cloud page
    local wordcloud_poems_arg=""
    if [ -n "$WORDCLOUD_POEMS" ]; then
        wordcloud_poems_arg="--poems-per-page $WORDCLOUD_POEMS"
    fi

    # Issue 10-036: thread chrono_per_page so the word-cloud poem links paginate to
    # the SAME chronological pages stage 9 built (separate processes must agree on
    # page size, or every #poem link lands on the wrong page).
    local chrono_per_page_arg=""
    if [ -n "$CHRONO_PER_PAGE" ]; then
        chrono_per_page_arg="--chrono-per-page $CHRONO_PER_PAGE"
    fi

    # The dry-run exit comes BEFORE the wipe below: a preview must leave the
    # existing per-word pages in place.
    if $DRY_RUN; then
        if [ -d "$DIR/output/wordcloud" ]; then
            log_dry_run "rm -f $DIR/output/wordcloud/*.html (clear stale per-word pages)"
        fi
        log_dry_run "luajit $DIR/src/wordcloud-generator.lua $DIR $wordcloud_words_arg $chrono_per_page_arg $RANDOM_SEED_ARG"
        log_dry_run "luajit $DIR/src/generate-word-pages.lua $DIR --html-only $wordcloud_words_arg $wordcloud_poems_arg $chrono_per_page_arg"
        return 0
    fi

    # Issue 10-059/10-061: wipe the per-word pages before regenerating. A word that
    # has fallen out of the cloud since the last build leaves an orphan page that the
    # generator never overwrites -- and an orphan from before a link-scheme change
    # ships BROKEN links (this is exactly how 134 stale "/similar-different/" pages
    # survived into a relative-path build). The pages are fully regenerated from the
    # current word set just below, so clearing every run (not only on --force) is
    # safe and is the only way to guarantee no stale orphans. Matches the principle
    # that each stage wipes its own output subdirectory before rebuilding it.
    if [ -d "$DIR/output/wordcloud" ]; then
        log_info " Clearing stale per-word pages before regeneration..."
        rm -f "$DIR/output/wordcloud/"*.html
    fi

    # The word cloud IS the site's menu (and carries the live poem index), so a
    # failure here is fatal, not a warning -- there is no usable entry page without it.
    log_info " Generating word cloud menu..."
    $NICE_PREFIX luajit "$DIR/src/wordcloud-generator.lua" "$DIR" $wordcloud_words_arg $chrono_per_page_arg $RANDOM_SEED_ARG || {
        echo "Error: Word cloud menu generation failed" >&2
        exit 1
    }

    log_info " Generating word similarity pages..."
    $NICE_PREFIX luajit "$DIR/src/generate-word-pages.lua" "$DIR" --html-only $wordcloud_words_arg $wordcloud_poems_arg $chrono_per_page_arg || {
        echo "Error: Word similarity page generation failed" >&2
        exit 1
    }
}

1703# }}}

1704

1705# }}}

# {{{ interactive_mode_tui
# TUI-based interactive mode with command preview
# Uses Lua menu library for stable rendering and real-time command preview

# _interactive_lua_fallback REASON
# Report why the TUI cannot be used (on stderr) and hand over to the Lua-based
# interactive mode. The script path is anchored on $DIR, like every other stage,
# so the fallback does not depend on the caller's working directory.
# Returns: luajit's exit status.
_interactive_lua_fallback() {
    echo "ERROR: $1" >&2
    echo "Falling back to Lua-based interactive mode..." >&2
    # ASSETS_ARG is a flag string built elsewhere; left unquoted so it word-splits.
    # shellcheck disable=SC2086
    luajit "$DIR/src/main.lua" "$DIR" -I $ASSETS_ARG
}

# _tui_checkbox_to_flag KEY VARNAME
# Set the global VARNAME to "true" when menu item KEY reads "1", else "false".
_tui_checkbox_to_flag() {
    if [[ "$(menu_get_value "$1")" == "1" ]]; then
        printf -v "$2" '%s' true
    else
        printf -v "$2" '%s' false
    fi
}

# _tui_number_to_var KEY VARNAME
# Copy menu item KEY into the global VARNAME, but only when the field holds
# something other than "" or "0"; otherwise VARNAME keeps its current value.
_tui_number_to_var() {
    local typed
    # Only the printed value matters; the getter's exit status is ignored.
    typed=$(menu_get_value "$1") || true
    if [[ -n "$typed" && "$typed" != "0" ]]; then
        printf -v "$2" '%s' "$typed"
    fi
}

# interactive_mode_tui
# Build the pipeline menu, run it, and translate the operator's selections into
# the script's global flags (stage toggles, FORCE / FORCE_STAGE_n, THREADS, PAGES,
# POEMS_PER_PAGE, CHRONO_PER_PAGE, DRY_RUN, VERBOSE, INCLUDE_BOOSTS,
# WORDCLOUD_WORDS, WORDCLOUD_POEMS).
# Returns: 0 once at least one stage is selected and "run" is chosen; the Lua
#          fallback's status when the TUI is unavailable or fails to initialise.
#          Calls `exit 0` when the operator quits the menu.
interactive_mode_tui() {
    if ! $TUI_AVAILABLE; then
        _interactive_lua_fallback "TUI library not available."
        return $?
    fi

    # Initialize TUI
    if ! tui_init; then
        _interactive_lua_fallback "TUI initialization failed."
        return $?
    fi

    # One row per pipeline stage, in pipeline order:
    #   menu key | global flag | label | default | description | CLI flag
    # The row position (1-based) is the stage number used for --force-stage N
    # and FORCE_STAGE_N.
    local -a stage_table=(
        "update_words|UPDATE_WORDS|1. Update Words|1|Sync input files from words repository|--update-words"
        "extract|EXTRACT|2. Extract|1|Extract content from backup archives|--extract"
        "parse|PARSE|3. Parse|1|Parse poems from JSON sources into poems.json|--parse"
        "validate|VALIDATE|4. Validate|1|Run poem validation|--validate"
        "catalog_images|CATALOG_IMAGES|5. Catalog Images|1|Catalog images from input directories|--catalog-images"
        "generate_embeddings|GENERATE_EMBEDDINGS|6. Embeddings ⚠️|0|Generate embeddings via the inference server (~2-3 hours)|--generate-embeddings"
        "generate_similarity|GENERATE_SIMILARITY|7. Similarity ⚠️|0|Build similarity matrix (~30 min)|--generate-similarity"
        "generate_diversity|GENERATE_DIVERSITY|8. Diversity ⚠️|0|Pre-compute diversity cache (~42 hours)|--generate-diversity"
        "generate_html|GENERATE_HTML|9. Generate HTML|1|Generate website HTML (chronological + similarity pages)|--generate-html"
        "generate_wordcloud|GENERATE_WORDCLOUD|10. Generate Word Cloud|1|Generate the word-cloud menu and per-word similarity pages|--generate-wordcloud"
    )
    local row key flag_var label default desc cli_flag
    local stage_no

    # Build the menu
    menu_init
    menu_set_title "Neocities Pipeline" "Use j/k to navigate, space to toggle, Enter to run"

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 1: Pipeline Stages (multi - can select multiple)
    # Each checkbox maps to a CLI flag for command preview
    # Issue 10-016: Force regeneration moved here with per-stage options
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "stages" "multi" "Pipeline Stages (toggle stages to run)"

    # Issue 10-016: Global force regenerate option at top of stages
    menu_add_item "stages" "force" "Force regenerate ALL stages" "checkbox" "0" \
        "Force regeneration even if files are fresh" "" "--force"

    # Each stage gets its toggle followed by its own "force" sub-item.
    stage_no=0
    for row in "${stage_table[@]}"; do
        stage_no=$((stage_no + 1))
        IFS='|' read -r key flag_var label default desc cli_flag <<< "$row"
        menu_add_item "stages" "$key" "$label" "checkbox" "$default" \
            "$desc" "" "$cli_flag"
        menu_add_item "stages" "force_${key}" " ↳ Force regenerate" "checkbox" "0" \
            "Force regenerate this stage only" "" "--force-stage ${stage_no}"
    done

    # Issue 10-016: Dependencies - per-stage force options disabled when global force is checked
    # invert=true means: enable per-stage force when global force is NOT checked
    for row in "${stage_table[@]}"; do
        IFS='|' read -r key flag_var label default desc cli_flag <<< "$row"
        menu_add_dependency "force_${key}" "force" "1" "true" \
            "Disabled: global force is active" "orange"
    done

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 2: Configuration Options
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "config" "multi" "Configuration"
    # Issue 10-034: Orchestrator pattern enables parallel HTML with low memory
    # Main thread sends 80KB work slices instead of workers loading 700MB caches
    # Expected memory: ~2.5GB total (vs 14GB+ before fix)
    menu_add_item "config" "threads" "Thread Count" "flag" "4:8" \
        "Threads for HTML gen (orchestrator mode)" "" "--threads"
    # Issue 8-022: Pagination options for HTML generation
    menu_add_item "config" "pages" "Pages per Poem" "flag" ":2" \
        "Pages to generate per poem (default: from config, 1)" "" "--pages"
    menu_add_item "config" "poems_per_page" "Poems per Page" "flag" ":3" \
        "Poems per page for similar/different (default: 200)" "" "--poems-per-page"
    menu_add_item "config" "chrono_per_page" "Chrono per Page" "flag" ":3" \
        "Poems per page for chronological (default: 500)" "" "--chrono-per-page"
    # Issue 10-016: Force Regeneration moved to stages section
    menu_add_item "config" "dry_run" "Dry Run" "checkbox" "0" \
        "Show what would be executed without running" "" "--dry-run"
    menu_add_item "config" "verbose" "Verbose Output" "checkbox" "0" \
        "Show detailed progress information" "" "--verbose"
    menu_add_item "config" "include_boosts" "Include Boosts" "checkbox" "0" \
        "Include fediverse boosts/reblogs in extraction" "" "--include-boosts"

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 3: Word Cloud Configuration
    # Issue 8-043: Configurable word count with "all words" toggle
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "wordcloud" "multi" "Word Cloud Options"
    menu_add_item "wordcloud" "wordcloud_all" "All Words" "checkbox" "0" \
        "Include all words (disables word count limit)" "" "--wordcloud-words all"
    menu_add_item "wordcloud" "wordcloud_words" "Word Count" "flag" "200:3" \
        "Maximum words in word cloud (default: 200)" "" "--wordcloud-words"
    # Issue 8-050d: Poems per word-cloud page
    menu_add_item "wordcloud" "wordcloud_poems" "Poems Per Page" "flag" "50:3" \
        "Poems per word-cloud similarity page (default: 50)" "" "--wordcloud-poems"
    # Dependency: Disable wordcloud_words when wordcloud_all is checked
    # invert=true means: enable wordcloud_words when wordcloud_all is NOT checked (value "1")
    menu_add_dependency "wordcloud_words" "wordcloud_all" "1" "true" \
        "Word count disabled when 'All Words' is checked"

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 4: Command Preview (shows the command that will be executed)
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "preview" "multi" "Command Preview"
    menu_add_item "preview" "cmd_preview" "" "text" "" \
        "The command that will be executed (press ~ to copy to clipboard)"

    # Configure command preview - links checkboxes to command string
    menu_set_command_config "./run.sh" "cmd_preview" ""

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 5: Actions
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "actions" "single" "Actions"
    menu_add_item "actions" "run" "Run Selected Stages" "action" "" \
        "Execute the selected pipeline stages" ""

    # Run the menu loop
    local any_stage_selected
    while true; do
        if ! menu_run; then
            # User quit
            menu_cleanup
            echo "Goodbye!"
            exit 0
        fi

        # User selected "run" - copy every selection into the global flags.
        # Stage toggles and (Issue 10-016) their per-stage force flags:
        any_stage_selected=false
        stage_no=0
        for row in "${stage_table[@]}"; do
            stage_no=$((stage_no + 1))
            IFS='|' read -r key flag_var label default desc cli_flag <<< "$row"
            _tui_checkbox_to_flag "$key" "$flag_var"
            _tui_checkbox_to_flag "force_${key}" "FORCE_STAGE_${stage_no}"
            if [[ "${!flag_var}" == "true" ]]; then
                any_stage_selected=true
            fi
        done

        # Config flags (Issue 8-022: pagination values come from the TUI too).
        # Numeric fields left empty or at 0 keep the value set before the menu.
        _tui_number_to_var "threads" THREADS
        _tui_number_to_var "pages" PAGES
        _tui_number_to_var "poems_per_page" POEMS_PER_PAGE
        _tui_number_to_var "chrono_per_page" CHRONO_PER_PAGE
        _tui_checkbox_to_flag "force" FORCE
        _tui_checkbox_to_flag "dry_run" DRY_RUN
        _tui_checkbox_to_flag "verbose" VERBOSE
        # Issue 8-011: Set boost inclusion from TUI
        _tui_checkbox_to_flag "include_boosts" INCLUDE_BOOSTS

        # Issue 8-043: Set the word count from the TUI. The "All Words" checkbox
        # wins -- it sets the count to the literal "all" (and the dependency has
        # already disabled the now-irrelevant Word Count field). Otherwise the
        # typed count is used. One value, WORDCLOUD_WORDS, feeds --wordcloud-words.
        if [[ "$(menu_get_value "wordcloud_all")" == "1" ]]; then
            WORDCLOUD_WORDS="all"
        else
            _tui_number_to_var "wordcloud_words" WORDCLOUD_WORDS
        fi
        # Issue 8-050d: Set poems per word-cloud page from TUI
        _tui_number_to_var "wordcloud_poems" WORDCLOUD_POEMS

        # Check if at least one stage is selected
        if ! $any_stage_selected; then
            echo ""
            echo "No stages selected. Please select at least one stage to run."
            echo "Press Enter to continue..."
            read -r
            continue
        fi

        # Exit menu and run the pipeline
        menu_cleanup
        return 0
    done
}
# }}}

# {{{ Main execution

# Handle interactive mode
EXECUTED_COMMAND=""  # Store command for post-run display
if $INTERACTIVE; then
    log_info "🎛️ Launching interactive mode with command preview..."
    interactive_mode_tui
    # Save the command preview for display after execution. interactive_mode_tui
    # can also return through its Lua fallback (TUI unavailable), in which case
    # the menu library may never have been loaded -- so only query the preview
    # when the getter actually exists, instead of emitting "command not found".
    # NOTE(review): the fallback's exit status is not inspected here, so the
    # stages below still run after a Lua-mode session -- confirm that is intended.
    if command -v menu_get_value >/dev/null 2>&1; then
        EXECUTED_COMMAND=$(menu_get_value "cmd_preview")
    fi
    # After TUI, fall through to execute selected stages
fi

1994

# Show what will be executed (in non-interactive or after TUI selection)
# Prints the enabled stages as an aligned table; only shown for --dry-run or
# --verbose. Uses script-level scratch variables prefixed with "_".
if $DRY_RUN || $VERBOSE; then
    echo "Pipeline stages to execute:"
    # Issue 10-051 / alignment: render the plan as a TABLE -- stage names in one
    # left-aligned column, the measured average time right-aligned in the next --
    # so durations line up and the eye can scan them. Measured wall-clock (avg of
    # recent runs) appears once a stage has run here before; until then a coarse
    # magnitude word (short/medium/long) stands in, since a word can't go stale
    # the way a hard number can. The ⚠ marks the heavy stages.
    #
    # Row format, as consumed by the `read` in pass 1:
    #   enabled|number|name|warn|timing-key|magnitude
    # The timing key can differ from the display name (word-cloud history is
    # stored under "wordcloud" but shown as "generate-wordcloud").
    # NOTE(review): no rows are visible between the parentheses in this view of
    # the file; with an empty array only the header line is printed -- confirm
    # the stage rows were not lost.
    _plan_rows=(
    )
    # Timing helpers are optional: use them only when stage_timing_mean exists.
    _have_timing=false
    command -v stage_timing_mean >/dev/null 2>&1 && _have_timing=true

    # Pass 1: collect enabled rows + each one's time string and tail, and track
    # the widest label and widest time. The ⚠ glyph is counted as ONE display
    # column (not its byte length) so the multibyte char does not skew alignment.
    _p_num=(); _p_label=(); _p_lvis=(); _p_time=(); _p_tail=()
    _labelw=0; _timew=0
    for _row in "${_plan_rows[@]}"; do
        IFS='|' read -r _en _num _name _warn _key _mag <<< "$_row"
        # Skip stages that are not enabled for this run.
        [ "$_en" = "true" ] || continue
        _lbl="$_name"; _lvis=${#_name}
        # Warned stages get " ⚠" appended: +2 visible columns (space + glyph),
        # regardless of what symbol_warning actually emits around the glyph.
        if [ "$_warn" = "1" ]; then _lbl="$_name $(symbol_warning "⚠")"; _lvis=$(( ${#_name} + 2 )); fi
        # Default: no measured time, tail is the magnitude word from the row.
        _time=""; _tail="$_mag"
        if $_have_timing; then
            _mean="$(stage_timing_mean "$_key" 2>/dev/null)"
            # A non-empty mean switches the row to "avg <time>, last N run(s)".
            if [ -n "$_mean" ]; then
                _cnt="$(stage_timing_count "$_key")"
                # Pluralise "run" unless the count is exactly 1.
                _pl="s"; [ "$_cnt" = "1" ] && _pl=""
                _time="$(stage_timing_format_seconds "$_mean")"
                _tail="last ${_cnt} run${_pl}"
            fi
        fi
        _p_num+=("$_num"); _p_label+=("$_lbl"); _p_lvis+=("$_lvis")
        _p_time+=("$_time"); _p_tail+=("$_tail")
        [ "$_lvis" -gt "$_labelw" ] && _labelw=$_lvis
        [ "${#_time}" -gt "$_timew" ] && _timew=${#_time}
    done

    # Pass 2: print aligned. Number in a 3-wide field ("1." / "10."), label padded
    # to _labelw, time right-aligned to _timew inside "(avg <time>, <tail>)".
    _i=0
    while [ "$_i" -lt "${#_p_num[@]}" ]; do
        # Pad by the VISIBLE label width recorded in pass 1, not the string length.
        _pad=$(( _labelw - ${_p_lvis[$_i]} ))
        _sp=""; [ "$_pad" -gt 0 ] && _sp="$(printf '%*s' "$_pad" '')"
        if [ -n "${_p_time[$_i]}" ]; then
            printf " %-3s %s%s (avg %*s, %s)\n" \
                "${_p_num[$_i]}." "${_p_label[$_i]}" "$_sp" \
                "$_timew" "${_p_time[$_i]}" "${_p_tail[$_i]}"
        else
            # No measurement yet: show only the tail (the magnitude word).
            printf " %-3s %s%s (%s)\n" \
                "${_p_num[$_i]}." "${_p_label[$_i]}" "$_sp" "${_p_tail[$_i]}"
        fi
        _i=$(( _i + 1 ))
    done
    echo ""
fi

2067

# {{{ Issue 10-017: Validate Inference server connectivity before embedding stages

# _probe_inference_server
# Ask libs/inference-server-config.lua whether the selected inference server is
# reachable. Prints one line on stdout (luajit's stderr is merged in):
#   OK:<server name>:<host url>      when validation succeeds
#   FAIL:<server name>:<message>     when it does not
# DIR and INFERENCE_SERVER are handed to Lua through the environment rather than
# spliced into the Lua source, so a quote character in either value cannot break
# (or alter) the chunk.
# Returns: luajit's exit status.
_probe_inference_server() {
    NEOCITIES_PROBE_DIR="$DIR" \
    NEOCITIES_PROBE_SERVER="$INFERENCE_SERVER" \
    luajit -e '
        package.path = os.getenv("NEOCITIES_PROBE_DIR") .. "/libs/?.lua;" .. package.path
        local inference = require("inference-server-config")
        local requested = os.getenv("NEOCITIES_PROBE_SERVER") or ""
        if requested ~= "" then
            inference.set_selected_server(requested)
        end
        local server = inference.get_selected_server()
        local ok, msg = inference.validate_server(server)
        if ok then
            print("OK:" .. server.name .. ":" .. inference.build_host_url(server))
        else
            print("FAIL:" .. server.name .. ":" .. msg)
        end
    ' 2>&1
}

# Only the embedding stage talks to the inference server, and a dry run never does.
if $GENERATE_EMBEDDINGS && ! $DRY_RUN; then
    log_info "Validating Inference server connectivity..."
    VALIDATION_RESULT=$(_probe_inference_server)

    if [[ "$VALIDATION_RESULT" == OK:* ]]; then
        # Field 2 is the server name; everything after it is the URL (which
        # itself contains colons, hence -f3-).
        SERVER_NAME=$(cut -d: -f2 <<< "$VALIDATION_RESULT")
        SERVER_URL=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
        log_info " ✓ Inference server '$SERVER_NAME' is reachable at $SERVER_URL"
    else
        # Server unreachable. Try to start it ourselves (and remember we
        # did, so the EXIT trap shuts it down again). If start succeeds,
        # re-validate to confirm /health is responsive before proceeding.
        SERVER_NAME=$(cut -d: -f2 <<< "$VALIDATION_RESULT")
        ERROR_MSG=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
        log_info " ✗ Inference server '$SERVER_NAME' not reachable: $ERROR_MSG"
        log_info " Attempting to start it via scripts/start-llamacpp-server.sh..."

        START_ARGS=("$DIR")
        if [ -n "$INFERENCE_SERVER" ]; then
            START_ARGS+=("--server=$INFERENCE_SERVER")
        fi

        if ! "$DIR/scripts/start-llamacpp-server.sh" "${START_ARGS[@]}"; then
            echo -e "${RED}❌ ERROR: Failed to start the inference server${NC}" >&2
            echo -e "${YELLOW}💡 Run ./scripts/start-llamacpp-server.sh manually for verbose output${NC}" >&2
            echo -e "${YELLOW}💡 Use --list-servers to see available servers${NC}" >&2
            echo -e "${YELLOW}💡 Use --server=NAME to select a different server${NC}" >&2
            exit 1
        fi

        # From here on the server is ours: flag it before anything can exit, so
        # the cleanup on exit also covers a failed re-validation.
        WE_STARTED_INFERENCE_SERVER=true

        # Re-validate to confirm the freshly-started server is responsive.
        VALIDATION_RESULT=$(_probe_inference_server)

        if [[ "$VALIDATION_RESULT" != OK:* ]]; then
            ERROR_MSG=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
            echo -e "${RED}❌ ERROR: Started the inference server but it is still not reachable${NC}" >&2
            echo -e "${RED} $ERROR_MSG${NC}" >&2
            echo -e "${YELLOW}💡 Check ${NEOCITIES_LOG_DIR:-$DIR/tmp}/llamacpp-server.log for the server's own diagnostics${NC}" >&2
            exit 1
        fi

        SERVER_NAME=$(cut -d: -f2 <<< "$VALIDATION_RESULT")
        SERVER_URL=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
        log_info " ✓ Inference server '$SERVER_NAME' started at $SERVER_URL"
        log_info " (will be shut down again when this run completes)"
    fi
fi
# }}}

2144

# Execute stages in pipeline order (regardless of argument order)
# Issue 10-051: timed_stage <name> wraps each stage so its wall-clock is recorded
# to .stage-timings on success (skipped stages and failures record nothing). The
# names here are the keys the pre-flight list reads back for its estimates.
if $UPDATE_WORDS; then
    timed_stage update-words run_update_words
fi

if $EXTRACT; then
    timed_stage extract run_extract
fi

# Issue 10-053: strip excluded content from input/ right after sync/extraction,
# before anything catalogs or embeds it. Tied to extraction (which follows sync).
if $EXTRACT; then
    timed_stage strip-excluded run_strip_excluded
fi

if $PARSE; then
    timed_stage parse run_parse
fi

if $VALIDATE; then
    timed_stage validate run_validate
fi

if $CATALOG_IMAGES; then
    timed_stage catalog-images run_catalog_images
fi

if $GENERATE_EMBEDDINGS; then
    timed_stage generate-embeddings run_generate_embeddings
fi

# Semantic colors are part of embedding generation (Stage 6.5)
# Only regenerate when embeddings are generated - HTML should use existing poem_colors.json
if $GENERATE_EMBEDDINGS; then
    timed_stage generate-semantic-colors run_generate_semantic_colors
fi

# Word embeddings run AFTER colors so the word-color step finds color_embeddings.json
if $GENERATE_EMBEDDINGS; then
    timed_stage generate-word-embeddings run_generate_word_embeddings
fi

# Issue 9-013: fold image pseudo-embeddings into the set BEFORE the similarity
# matrix is built, so images rank alongside poems. Idempotent + cheap.
if $GENERATE_SIMILARITY; then
    timed_stage augment-images run_augment_images
fi

if $GENERATE_SIMILARITY; then
    timed_stage generate-similarity run_generate_similarity
fi

if $GENERATE_DIVERSITY; then
    timed_stage generate-diversity run_generate_diversity
fi

if $GENERATE_HTML; then
    timed_stage generate-html run_generate_html
fi

# Timing key is "wordcloud", not the function's display name.
if $GENERATE_WORDCLOUD; then
    timed_stage wordcloud run_generate_wordcloud
fi

# Completion banner, suppressed by the QUIET flag.
if ! $QUIET; then
    echo ""
    echo -e "$(symbol_success "✅") Pipeline completed successfully"

    # When the run came from the interactive menu, echo the equivalent command
    # line so it can be copy-pasted for a re-run.
    if [ -n "$EXECUTED_COMMAND" ]; then
        echo ""
        echo -e "$(symbol_info "📋") Command executed:"
        echo " $EXECUTED_COMMAND"
    fi
fi
# }}}