run.sh

2183 lines

1#!/bin/bash
2
3# run.sh - Main orchestrator for neocities-modernization pipeline
4#
5# Runs the complete poem processing pipeline from input files to generated HTML.
6# Supports selective stage execution via CLI flags, with stages running in
7# pipeline order regardless of argument order.
8#
9# The full pipeline has 10 stages:
10# 1. Update Words - Sync input files from words repository
11# 2. Extract - Extract content from backup archives
12# 3. Parse - Generate poems.json from sources
13# 4. Validate - Validate poem data
14# 5. Catalog Images - Generate image-catalog.json
15# 6. Embeddings - Generate poem embeddings via the inference server (~2-3 hours)
16# 7. Similarity - Build similarity matrix (~30 min)
17# 8. Diversity - Pre-compute diversity cache (~42 hours)
18# 9. Generate HTML - Generate poem pages, gallery, and source browser
19# 10. Generate WordCloud - Generate the word-cloud menu and per-word pages
20#
21# Stages are selected individually with named flags (--extract,
22# --generate-diversity, etc.) or by stage number (--stage 8,
23# --stage=5). Use --full to run all 10 stages.
24#
25# Usage: ./run.sh [FLAGS] [PROJECT_DIR]
26
27# {{{ setup_dir_path
setup_dir_path() {
    # Print the project directory to use: the caller's argument when it is
    # non-empty, otherwise the built-in default checkout location.
    echo "${1:-/mnt/mtwo/programming/ai-stuff/neocities-modernization}"
}
35# }}}
36
37# {{{ Signal handling
38# Trap Ctrl+C so the script actually exits when the operator interrupts.
39# Bash on its own does not always propagate SIGINT to long-running children
40# (luajit's tight inner loops in particular eat the signal), so we kill
41# every background job in our process group and exit non-zero. Exit code
42# 130 is the conventional value for "terminated by SIGINT" (128 + signal#).
cleanup_on_interrupt() {
    # Ignore further INT/TERM for the rest of the cleanup. The process-group
    # kill below delivers SIGTERM to this shell as well; with TERM still
    # trapped to this function the handler would be re-entered from inside
    # itself. Ignoring (rather than resetting to default) keeps the shell
    # alive long enough to reach `exit 130`, so the EXIT trap still runs.
    trap '' INT TERM
    echo
    echo "Interrupted by user (SIGINT)" >&2
    # Kill anything we backgrounded; suppress errors when there are none.
    jobs -p | xargs -r kill 2>/dev/null
    # Best-effort kill the entire process group too, in case a child
    # spawned its own children without forwarding signals.
    kill -- -$$ 2>/dev/null
    # 130 = 128 + SIGINT, the conventional "terminated by Ctrl+C" status.
    exit 130
}
trap cleanup_on_interrupt INT TERM
54
55# WE_STARTED_INFERENCE_SERVER tracks whether THIS run started the llama.cpp
56# server itself (because validation failed at startup). If it's true, the
57# EXIT trap below shuts the server down again. If the operator (or a prior
58# run) was already running a server when we started, we leave it alone —
59# never kill what we did not start.
WE_STARTED_INFERENCE_SERVER=false

# cleanup_inference_server: EXIT-trap handler that stops the llama.cpp server,
# but only when WE_STARTED_INFERENCE_SERVER says this run launched it. The PID
# comes from $DIR/tmp/llamacpp-server.pid; a missing file is a no-op and an
# empty/dead PID just gets the stale file removed. Best-effort: SIGTERM first,
# up to five one-second polls, then SIGKILL if the process is still there.
cleanup_inference_server() {
    # Never touch a server this run did not start.
    $WE_STARTED_INFERENCE_SERVER || return 0

    local pidfile="$DIR/tmp/llamacpp-server.pid"
    [ -f "$pidfile" ] || return 0

    local server_pid
    server_pid=$(cat "$pidfile" 2>/dev/null)
    if [ -z "$server_pid" ] || ! kill -0 "$server_pid" 2>/dev/null; then
        # Stale PID file: nothing to stop, just tidy up.
        rm -f "$pidfile"
        return
    fi

    echo "Shutting down inference server (PID $server_pid) that this run started..." >&2
    kill "$server_pid" 2>/dev/null

    # Poll once per second, at most five times, for the process to go away.
    local waited
    for ((waited = 0; waited < 5; waited++)); do
        kill -0 "$server_pid" 2>/dev/null || break
        sleep 1
    done

    if kill -0 "$server_pid" 2>/dev/null; then
        echo " server did not exit on SIGTERM; sending SIGKILL" >&2
        kill -9 "$server_pid" 2>/dev/null
    fi
    rm -f "$pidfile"
}
trap cleanup_inference_server EXIT
102# }}}
103
104# {{{ TUI Library
105# Source TUI library for interactive mode with command preview
LIBS_DIR="/home/ritz/programming/ai-stuff/scripts/libs"
TUI_AVAILABLE=false
# The TUI needs both the menu library file and a luajit binary on PATH;
# if either is missing we stay in plain CLI mode.
if [[ ! -f "${LIBS_DIR}/lua-menu.sh" ]] || ! command -v luajit &>/dev/null; then
    :
else
    source "${LIBS_DIR}/lua-menu.sh"
    TUI_AVAILABLE=true
fi
112# }}}
113
114# {{{ show_help
show_help() {
    # Print the usage text to stdout. The heredoc delimiter is quoted, so
    # nothing inside is expanded. Every flag the argument parser accepts
    # should be listed here, including --no-boosts / --exclude-boosts.
    cat << 'EOF'
Usage: ./run.sh [FLAGS] [PROJECT_DIR]

Runs the poem processing pipeline. Stages are selected individually
by named flag (--generate-diversity, --parse, etc.) or by stage
number (--stage N, --stage=N). Use --full to run every stage.
With stage flags, runs only the specified stages in pipeline order.

Pipeline Stages (run in order, multiple can be specified):
  --update-words          Stage 1: Sync input files from words repository
  --extract               Stage 2: Extract content from backup archives
  --parse                 Stage 3: Parse poems from JSON sources into poems.json
  --validate              Stage 4: Run poem validation
  --catalog-images        Stage 5: Catalog images from input directories
  --generate-embeddings   Stage 6: Generate embeddings via the inference server (~2-3 hours)
  --generate-similarity   Stage 7: Build similarity matrix (~30 min)
  --generate-diversity    Stage 8: Pre-compute diversity cache (~42 hours)
  --generate-html         Stage 9: Generate poem pages, gallery, source browser
  --generate-wordcloud    Stage 10: Generate word-cloud menu and per-word pages

Stage Selection:
  --stage N               Select stage by number (e.g. --stage 8, --stage=5)
  --full                  Run ALL stages 1-10 including embeddings

Stage Configuration:
  --threads N             Thread count for parallel operations (default: 4)
  --force                 Force regeneration even if files are fresh
  --force-stage N         Force regenerate specific stage only (1-10)

Pagination (HTML Generation):
  --pages N               Pages per poem (default: from config, 1)
  --poems-per-page N      Poems per page for similar/different (default: 200)
  --chrono-per-page N     Poems per page for chronological (default: 500)
  --seed N                Master seed for all randomization (word-cloud shuffle,
                          image order). Same seed => identical output. Precedence:
                          this flag > config.randomization.seed > an auto-generated
                          seed. The resolved seed is recorded to
                          output/generation-metadata.json. (Issue 10-058)

Word Cloud:
  --wordcloud-words N     Number of words in word cloud (default: 200);
                          pass "all" (--wordcloud-words all) for every word
  --wordcloud-poems N     Poems per word-cloud page (default: 50)

Extraction Options:
  --include-boosts        Include fediverse boosts/reblogs in extraction
  --no-boosts             Exclude fediverse boosts/reblogs (alias: --exclude-boosts)

External Files (Issue 10-003b):
  --list-external         List configured external file sources
  --sync-only NAME        Sync only the specified external source

Inference Server (Issue 10-017):
  --server NAME           Use specific Inference server from config.lua
  --model NAME            Embedding model to use; must be one of the server's
                          available_models (default: the server's configured model)
  --list-servers          List available Inference servers and exit

Output Control:
  --quiet                 Suppress progress messages
  --verbose               Show detailed progress
  --dry-run               Show what would be executed without running
  --debug                 Write all logs to output/debug-logs/ (durable disk) and
                          keep them on exit, instead of the RAM-backed tmp/ that a
                          hard GPU lock + reboot would wipe. Also tees this script's
                          console output to output/debug-logs/run.log.
  --low-priority          Run compute-heavy stages at lower OS priority (nice -n 10)
                          Keeps desktop/terminal responsive during long operations

Interactive Mode:
  -I, --interactive       Launch TUI for interactive selection (with command preview)

Directory Options:
  --dir PATH              Assets directory (where poems.json etc. are stored)
  --output PATH           Output directory (default: output/)

Other:
  -h, --help              Show this help message

Examples:
  ./run.sh --full                         # Run ALL stages including embeddings
  ./run.sh --generate-html                # Only regenerate HTML
  ./run.sh --stage 8                      # Run stage 8 by number
  ./run.sh --stage=5 --stage=9            # Stage 5 and stage 9
  ./run.sh --parse --generate-html        # Parse then generate HTML
  ./run.sh --generate-html --threads 8    # HTML with 8 threads
  ./run.sh --generate-html --pages 5      # Generate top 500 poems per file
  ./run.sh -I                             # Interactive TUI mode

Notes:
  - Stage 6 (embeddings) requires the inference server running with the embedding model
  - Stage 8 (diversity) takes ~42 hours but is a one-time cost
  - Once stages 6-8 complete, subsequent runs use cached data
EOF
}
210# }}}
211
212# {{{ Parse command line arguments
# Defaults for everything the argument parser can set. Each variable is
# initialised explicitly so a same-named variable exported in the caller's
# environment can never leak into the run.
DIR=""
ASSETS_DIR=""
OUTPUT_DIR=""
INTERACTIVE=false

# Stage flags (boolean)
UPDATE_WORDS=false
EXTRACT=false
PARSE=false
VALIDATE=false
CATALOG_IMAGES=false
GENERATE_EMBEDDINGS=false
GENERATE_SIMILARITY=false
GENERATE_DIVERSITY=false
GENERATE_HTML=false
GENERATE_WORDCLOUD=false

# Config flags
THREADS=""
# Boost-inclusion override forwarded to extraction (empty = use config default)
BOOSTS_ARG=""
FORCE=false
# Issue 10-016: Per-stage force flags
FORCE_STAGE_1=false
FORCE_STAGE_2=false
FORCE_STAGE_3=false
FORCE_STAGE_4=false
FORCE_STAGE_5=false
FORCE_STAGE_6=false
FORCE_STAGE_7=false
FORCE_STAGE_8=false
FORCE_STAGE_9=false
FORCE_STAGE_10=false
QUIET=false
VERBOSE=false
DRY_RUN=false
# --debug: route all logs to output/ (durable disk) instead of the RAM-backed
# tmp/ symlink, and preserve them on exit. Added to diagnose a hard GPU lock:
# such a freeze forces a power-cycle, and the tmpfs-backed tmp/ is wiped on
# reboot, taking every diagnostic with it. See the setup block after `cd $DIR`.
DEBUG=false
# Issue 10-028: Lower process priority for UI responsiveness
LOW_PRIORITY=false
# Model propagation fix: run.sh no longer hard-codes a default model here, so
# source code can never disagree with config.lua about the default. CLI_MODEL
# holds --model ONLY when the operator actually passed it (that is what we record
# on the per-run overrides notepad); the effective MODEL_NAME used by run.sh's
# own path/freshness checks is resolved AFTER arg-parsing -- from CLI_MODEL if
# given, else from config.lua through the same code the child stages use, so
# every stage agrees by construction. See "Resolve the effective model" below.
CLI_MODEL=""
MODEL_NAME=""
# Issue 8-022: Pagination settings for HTML generation
PAGES=""
POEMS_PER_PAGE=""
# --chrono-per-page is parsed like the two settings above, so it gets the same
# explicit empty default (empty = no override given on the command line).
CHRONO_PER_PAGE=""

# Issue 10-058: master seed for all randomization. Empty here means "no --seed on
# the command line"; the resolver below then falls back to config.randomization.seed
# and finally to an auto-generated, recorded seed. RANDOM_SEED is the resolved value.
RANDOM_SEED_FLAG=""
RANDOM_SEED=""
RANDOM_SEED_SOURCE=""

# Issue 8-043: Word cloud configuration
# Word-cloud word count: a number, or the literal "all" for every word. Both the
# CLI (--wordcloud-words all) and the menu's "All Words" checkbox set this single
# value -- there is no separate "all" flag to keep in sync.
WORDCLOUD_WORDS=""
# Issue 8-050d: Poems per word-cloud page
WORDCLOUD_POEMS=""

# Issue 8-011: Fediverse boost inclusion (extraction stage)
INCLUDE_BOOSTS=false

# Issue 10-003b: External file management
LIST_EXTERNAL=false
SYNC_ONLY=""

# Issue 10-017: Inference server configuration
INFERENCE_SERVER=""
LIST_SERVERS=false

# Track if any stage flag was explicitly set
STAGE_FLAG_SET=false
297
# _args_take VAR FLAG [NEXT...]: store the value of a value-taking option in
# the global variable VAR and report in _ARGS_CONSUMED how many words the
# caller must shift. Handles both "--flag=value" (1 word) and "--flag value"
# (2 words). A space-form flag with nothing after it is a hard error: without
# this check `shift 2` fails, shifts nothing, and the parse loop never ends.
_args_take() {
    local target=$1 flag=$2
    if [[ "$flag" == --*=* ]]; then
        printf -v "$target" '%s' "${flag#*=}"
        _ARGS_CONSUMED=1
        return 0
    fi
    if [ $# -lt 3 ]; then
        echo "ERROR: $flag requires a value" >&2
        echo "Use --help for usage information" >&2
        exit 1
    fi
    printf -v "$target" '%s' "$3"
    _ARGS_CONSUMED=2
}

# _args_force_stage N: Issue 10-016 -- set FORCE_STAGE_<N>=true for N in 1-10,
# exit 1 on anything else (including a missing value).
_args_force_stage() {
    case "$1" in
        [1-9]|10)
            printf -v "FORCE_STAGE_$1" '%s' true
            ;;
        *)
            echo "ERROR: Invalid stage number: $1 (valid: 1-10)" >&2
            exit 1
            ;;
    esac
}

# _args_select_stage N: turn on the stage flag for pipeline stage N.
# Stage map (numeric): 1=update-words, 2=extract, 3=parse, 4=validate,
# 5=catalog-images, 6=generate-embeddings, 7=generate-similarity,
# 8=generate-diversity, 9=generate-html, 10=generate-wordcloud.
_args_select_stage() {
    local -a stage_vars=(
        UPDATE_WORDS EXTRACT PARSE VALIDATE CATALOG_IMAGES
        GENERATE_EMBEDDINGS GENERATE_SIMILARITY GENERATE_DIVERSITY
        GENERATE_HTML GENERATE_WORDCLOUD
    )
    case "$1" in
        [1-9]|10) ;;
        *)
            echo "Error: --stage expects a number 1-10, got: $1" >&2
            exit 1
            ;;
    esac
    printf -v "${stage_vars[$1 - 1]}" '%s' true
    STAGE_FLAG_SET=true
}

# _parse_cli_args ARGS...: walk the command line and set the option globals.
# Order of arguments does not matter; stages always run in pipeline order.
# Unknown dash-options are an error; a bare word is taken as PROJECT_DIR.
_parse_cli_args() {
    local n
    while [[ $# -gt 0 ]]; do
        case $1 in
            -h|--help)
                show_help
                exit 0
                ;;
            -I|--interactive)
                INTERACTIVE=true
                shift
                ;;

            # Value-taking options ("--flag value" and "--flag=value").
            --dir|--dir=*)
                _args_take ASSETS_DIR "$@"; shift "$_ARGS_CONSUMED" ;;
            --output|--output=*)
                _args_take OUTPUT_DIR "$@"; shift "$_ARGS_CONSUMED" ;;
            --threads|--threads=*)
                _args_take THREADS "$@"; shift "$_ARGS_CONSUMED" ;;
            # Issue 10-058: master seed for all randomization. Highest precedence,
            # overrides config.randomization.seed. Resolved + recorded after DIR setup.
            --seed|--seed=*)
                _args_take RANDOM_SEED_FLAG "$@"; shift "$_ARGS_CONSUMED" ;;
            --model|--model=*)
                _args_take CLI_MODEL "$@"; shift "$_ARGS_CONSUMED" ;;
            # Issue 8-022: Pagination flags for HTML generation
            --pages|--pages=*)
                _args_take PAGES "$@"; shift "$_ARGS_CONSUMED" ;;
            --poems-per-page|--poems-per-page=*)
                _args_take POEMS_PER_PAGE "$@"; shift "$_ARGS_CONSUMED" ;;
            --chrono-per-page|--chrono-per-page=*)
                _args_take CHRONO_PER_PAGE "$@"; shift "$_ARGS_CONSUMED" ;;
            # Issue 8-043: Word cloud configuration. Word count is set with
            # --wordcloud-words N, or "--wordcloud-words all" for every word.
            --wordcloud-words|--wordcloud-words=*)
                _args_take WORDCLOUD_WORDS "$@"; shift "$_ARGS_CONSUMED" ;;
            # Issue 8-050d: Poems per word-cloud page
            --wordcloud-poems|--wordcloud-poems=*)
                _args_take WORDCLOUD_POEMS "$@"; shift "$_ARGS_CONSUMED" ;;
            # Issue 10-003b: External file management
            --sync-only|--sync-only=*)
                _args_take SYNC_ONLY "$@"; shift "$_ARGS_CONSUMED" ;;
            # Issue 10-017: Inference server configuration
            --server|--server=*)
                _args_take INFERENCE_SERVER "$@"; shift "$_ARGS_CONSUMED" ;;

            --force)
                FORCE=true
                shift
                ;;
            # Issue 10-016: Per-stage force regeneration. A missing value reaches
            # _args_force_stage as "" and is rejected there before the shift.
            --force-stage)
                _args_force_stage "${2-}"
                shift 2
                ;;
            --force-stage=*)
                _args_force_stage "${1#*=}"
                shift
                ;;
            --quiet)
                QUIET=true
                shift
                ;;
            --verbose)
                VERBOSE=true
                shift
                ;;
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            # Boost inclusion (reshared posts). These override config.privacy.
            # include_boosts and are forwarded to the extraction step. Only take
            # effect on a (re)parse, since they change what poems.json contains.
            # Both BOOSTS_ARG and INCLUDE_BOOSTS (Issue 8-011) are set in the one
            # arm: `case` runs only the first matching arm, so a second
            # --include-boosts arm further down could never execute.
            --no-boosts|--exclude-boosts)
                BOOSTS_ARG="--no-boosts"
                INCLUDE_BOOSTS=false
                shift
                ;;
            --include-boosts)
                BOOSTS_ARG="--include-boosts"
                INCLUDE_BOOSTS=true
                shift
                ;;
            # --debug: persist logs to output/ (survives the reboot a hard GPU
            # lock forces). Handled after DIR is resolved, below.
            --debug)
                DEBUG=true
                shift
                ;;
            # Issue 10-028: Lower process priority for UI responsiveness
            --low-priority)
                LOW_PRIORITY=true
                shift
                ;;
            --list-external)
                LIST_EXTERNAL=true
                shift
                ;;
            --list-servers)
                LIST_SERVERS=true
                shift
                ;;

            # Named stage flags
            --update-words)        _args_select_stage 1;  shift ;;
            --extract)             _args_select_stage 2;  shift ;;
            --parse)               _args_select_stage 3;  shift ;;
            --validate)            _args_select_stage 4;  shift ;;
            --catalog-images)      _args_select_stage 5;  shift ;;
            --generate-embeddings) _args_select_stage 6;  shift ;;
            --generate-similarity) _args_select_stage 7;  shift ;;
            --generate-diversity)  _args_select_stage 8;  shift ;;
            --generate-html)       _args_select_stage 9;  shift ;;
            --generate-wordcloud)  _args_select_stage 10; shift ;;

            # --stage N or --stage=N -- select a specific stage by number.
            # Can be repeated (e.g. --stage 6 --stage 7).
            --stage)
                _args_select_stage "${2-}"
                shift 2
                ;;
            --stage=*)
                _args_select_stage "${1#*=}"
                shift
                ;;
            --full)
                # ALL stages including expensive embedding generation (1-10)
                for n in 1 2 3 4 5 6 7 8 9 10; do
                    _args_select_stage "$n"
                done
                shift
                ;;
            -*)
                echo "Unknown option: $1" >&2
                echo "Use --help for usage information" >&2
                exit 1
                ;;
            *)
                DIR="$1"
                shift
                ;;
        esac
    done
}

_parse_cli_args "$@"
# The original inline loop consumed every positional parameter; keep that
# post-condition so later code sees an empty "$@" exactly as before.
set --
622
623# No implicit stages — require explicit selection. The operator should
624# say what they want to run: a named stage flag, --stage N, or --full.
# _require_stage_selection: exit 1 unless the operator asked for something to
# do. "Something" is a stage flag, interactive mode, or one of the immediate
# actions that run and exit without touching the pipeline (--list-servers,
# --list-external, --sync-only NAME). The last two are handled further down
# the script, so they must be let through here or they can never run alone.
_require_stage_selection() {
    if ! $STAGE_FLAG_SET && ! $INTERACTIVE && ! $LIST_SERVERS \
        && ! $LIST_EXTERNAL && [ -z "$SYNC_ONLY" ]; then
        echo "Error: no stages selected. Use --full, a named stage flag" >&2
        echo " (e.g. --generate-diversity), --stage N, or -I for interactive mode." >&2
        echo " Run with --help for the full flag list." >&2
        exit 1
    fi
}
_require_stage_selection

# Issue 8-032: Convert FORCE to Lua boolean for passing to Lua functions
FORCE_LUA="false"
if $FORCE; then
    FORCE_LUA="true"
fi

# Issue 10-028: Set up nice prefix for low priority execution
# When enabled, heavy operations run at nice level 10 (lower priority)
# This keeps the desktop/terminal responsive during long pipeline runs
NICE_PREFIX=""
if $LOW_PRIORITY; then
    NICE_PREFIX="nice -n 10"
fi
646# }}}
647
648# {{{ Setup directories
DIR=$(setup_dir_path "$DIR")

# Issue 10-051: optional stage wall-clock timing. The library is sourced only
# once DIR is final; when the file is not there, timing is simply skipped.
if [ -f "${DIR}/scripts/stage-timing.sh" ]; then
    source "${DIR}/scripts/stage-timing.sh"
fi
# Whatever happened above, timed_stage must exist for the stage dispatch.
# The fallback drops its first argument and runs the rest as the command.
if ! command -v timed_stage >/dev/null 2>&1; then
    timed_stage() { shift; "$@"; }
fi

# Build arguments for Lua scripts: "--dir <assets>" when --dir was given,
# otherwise an empty string.
ASSETS_ARG="${ASSETS_DIR:+--dir $ASSETS_DIR}"

# Ensure we're in the right directory
if ! cd "$DIR"; then
    echo "Error: Could not access directory $DIR" >&2
    exit 1
fi
671# }}}
672
673# {{{ --debug: persistent logging
674# Why this exists: a hard GPU lock forces a power-cycle, and the tmp/ symlink
675# points at a tmpfs subdir under /tmp/ (RAM) that the reboot wipes — so the
676# logs that would explain the freeze are gone before they can be read.
677# --debug routes logs to output/debug-logs/ (durable disk) instead.
678#
679# Two mechanisms, working together:
680# 1. NEOCITIES_LOG_DIR is exported so the child scripts that own the
681# inference logs — scripts/start-llamacpp-server.sh (llamacpp-server.log)
682# and generate-embeddings.sh (embedding_generation.log) — write there
683# and skip their usual end-of-run log deletion.
684# 2. This script's own console output is tee'd to run.log, so whatever stage
685# was mid-flight at the instant of the freeze (including the GPU Vulkan
686# similarity/diversity stages, which log only to stdout) leaves a trail.
687#
688# Caveat worth knowing: on a true hard lock you must hard-power-cycle, and the
689# kernel may not have flushed the last few seconds of file writes (dirty pages)
690# to disk. Durable disk still captures vastly more than tmpfs, but the final
691# line or two before the lock can still be lost.
if $DEBUG; then
    LOG_DIR="$DIR/output/debug-logs"
    mkdir -p "$LOG_DIR"
    # NEOCITIES_LOG_DIR is exported for the child scripts that own the
    # inference logs (see the section comment above). VKC_DEBUG asks the Vulkan
    # C library for newline-terminated progress lines instead of the animated
    # single-line "\r" form -- the right shape for a per-line log on disk.
    export NEOCITIES_LOG_DIR="$LOG_DIR"
    export VKC_DEBUG=1
    # Outside interactive mode, send this script's stdout and stderr through
    # fsync-logger so every line is fsync()'d to run.log as it is printed; a
    # hard lock right after a stage banner must not lose that banner. In
    # interactive mode stdout is left alone: the TUI checks isatty() and a pipe
    # would break its rendering. Slow, but --debug is for catching a freeze.
    $INTERACTIVE || exec > >("$DIR/scripts/fsync-logger" "$LOG_DIR/run.log") 2>&1
    echo "[DEBUG] Logging to $LOG_DIR (per-line fsync to disk; persists across reboots; logs kept on exit)"
fi
714# }}}
715
716# {{{ Issue 10-003b: Handle external file commands (immediate actions)
# --list-external: print the configured sources and stop; always exits 0.
if $LIST_EXTERNAL; then
    "$DIR/scripts/sync-external-files" --list
    exit 0
fi

# --sync-only NAME: sync that one source and exit with the helper's status
# (a bare `exit` reuses the status of the command just run).
[ -z "$SYNC_ONLY" ] || {
    "$DIR/scripts/sync-external-files" "$SYNC_ONLY"
    exit
}
726# }}}
727
728# {{{ Issue 10-017: Handle Inference server commands (immediate actions)
# --list-servers: ask the inference-server-config Lua module (found via
# $DIR/libs) to print its server list, then stop without running any stage.
if $LIST_SERVERS; then
    luajit -e "package.path = '$DIR/libs/?.lua;' .. package.path; require('inference-server-config').list_servers()"
    exit 0
fi
737# }}}
738
739# {{{ Resolve the effective embedding model and record this run's overrides
740# Why this exists: run.sh launches a fresh luajit process per stage, and argv/env
741# reach only the stages we remember to thread them through. Before this block, a
742# --model override silently reverted to config.lua's default in the HTML,
743# word-cloud and word-page stages (they resolve the model via get_selected_model()
744# / embeddings_dir() with no argument). The fix is a shared notepad in RAM: we
745# stamp THIS run's choices onto tmp/run-overrides.lua once, here, and the model
746# resolver reads them. It is rewritten every run, so a previous run's --model can
747# never leak in -- the staleness trap a file has but an env var does not. Passing
748# an empty CLI_MODEL records no model key, so a plain run falls back to config.lua.
# Make sure the tmp/ symlink AND its target exist before anything writes
# there. `mkdir -p tmp` is not enough: tmp/ is a symlink, and when its target
# is gone mkdir sees the link, reports "exists", and creates nothing.
# ensure-tmp-symlink is the project's helper for exactly this case.
if ! "$DIR/scripts/ensure-tmp-symlink" "$DIR"; then
    echo "Error: could not materialize the tmp/ RAM directory (scripts/ensure-tmp-symlink)" >&2
    exit 1
fi
# Record this run's overrides; an empty CLI_MODEL is passed through as-is.
if ! "$DIR/scripts/write-run-overrides" "$DIR" --model "$CLI_MODEL"; then
    echo "Error: failed to record run overrides (scripts/write-run-overrides)" >&2
    exit 1
fi

# Effective model for run.sh's own path/freshness checks: the --model value
# when one was given, otherwise whatever inference-server-config resolves for
# the selected server (honouring --server when present). An empty result from
# the resolver is fatal.
MODEL_NAME="$CLI_MODEL"
if [ -z "$MODEL_NAME" ]; then
    MODEL_NAME="$(luajit -e "
        package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
        local inf = require('inference-server-config')
        inf.set_project_root('$DIR')
        if '$INFERENCE_SERVER' ~= '' then inf.set_selected_server('$INFERENCE_SERVER') end
        io.write(inf.get_selected_model())
    ")"
    if [ -z "$MODEL_NAME" ]; then
        echo "Error: could not resolve embedding model from config.lua" >&2
        exit 1
    fi
fi
782
783# Create this model's cache directories ONCE here, at model-load, instead of
784# making each stage remember to mkdir its own output dir before its first write.
785# The paths are inferred from the model name by scripts/cache-dir (the single
786# place that maps a model -> its directories): the movable (RAM) dir, its
787# similarities/ subdir, and the reboot-surviving on-disk dir (--disk). A
788# brand-new model otherwise has no assets/embeddings/<model>/ folder, which once
789# let a 40-minute diversity run finish and then fail at its final write. Adding a
790# new model now needs no manual mkdir -- selecting it is enough.
# Ask scripts/cache-dir for this model's two cache locations (default and
# --disk), then create them -- plus the similarities/ subdir of the first --
# in one go. Either lookup coming back empty, or mkdir failing, is fatal.
_ram_dir="$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME")"
_disk_dir="$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME" --disk)"
if [[ -z "$_ram_dir" || -z "$_disk_dir" ]]; then
    echo "Error: could not resolve cache directories for model $MODEL_NAME" >&2
    exit 1
fi
if ! mkdir -p "$_ram_dir/similarities" "$_disk_dir"; then
    echo "Error: could not create cache directories for model $MODEL_NAME" >&2
    exit 1
fi
801# }}}
802
803# {{{ Logging functions
# ANSI color codes for terminal output
# These add visual distinction to success/info/error messages
COLOR_GREEN="\033[92m"    # Bright green for success (✓, ✅)
COLOR_BLUE="\033[94m"     # Bright blue for info (ℹ️)
COLOR_RED="\033[91m"      # Bright red for errors (✗, ❌)
COLOR_YELLOW="\033[93m"   # Bright yellow for warnings (⚠️)
COLOR_MAGENTA="\033[95m"  # Bright magenta for stage delimiters
COLOR_RESET="\033[0m"     # Reset to default

# log_info MSG: print MSG unless --quiet is in effect.
log_info() {
    if $QUIET; then
        return 0
    fi
    echo "$1"
}

# log_verbose MSG: print MSG only when --verbose is in effect.
log_verbose() {
    if ! $VERBOSE; then
        return 0
    fi
    echo "$1"
}

# log_stage TITLE: print a blank line and a three-line banner (magenta rule,
# green title, magenta rule) unless --quiet is in effect.
log_stage() {
    if $QUIET; then
        return 0
    fi
    local rule="${COLOR_MAGENTA}═══════════════════════════════════════════════════════════════════${COLOR_RESET}"
    echo ""
    echo -e "$rule"
    echo -e " ${COLOR_GREEN}$1${COLOR_RESET}"
    echo -e "$rule"
}

# log_dry_run CMD: announce what a --dry-run would have executed.
log_dry_run() {
    echo "[DRY-RUN] Would execute: $1"
}

# _paint COLOR TEXT: shared body of the symbol_* helpers -- TEXT wrapped in
# COLOR and followed by a reset, escapes interpreted by echo -e.
_paint() {
    echo -e "${1}${2}${COLOR_RESET}"
}

# Colored symbol helpers
symbol_success() { _paint "$COLOR_GREEN" "$1"; }
symbol_info()    { _paint "$COLOR_BLUE" "$1"; }
symbol_error()   { _paint "$COLOR_RED" "$1"; }
symbol_warning() { _paint "$COLOR_YELLOW" "$1"; }
854# }}}
855
856# {{{ Issue 10-058: resolve + record the build's master seed
857# A single integer governs every randomization site this run (the word-cloud
858# shuffle and image-order randomization). Resolved here -- after arg parsing and
859# DIR setup, before any stage -- with this precedence (highest first):
860# 1. --seed N on the command line
861# 2. config.randomization.seed in config.lua
862# 3. an auto-generated seed (so an unseeded build is still reproducible after
863# the fact, because we RECORD whatever we resolve)
864# The resolved seed is logged, written to output/generation-metadata.json, and
865# threaded to each randomizing subprocess as "--seed=N".
866
867# {{{ resolve_random_seed()
# Sets RANDOM_SEED and RANDOM_SEED_SOURCE from, in order of precedence:
# RANDOM_SEED_FLAG (--seed), config.lua's randomization.seed, or a freshly
# generated value. Performs no validation; see validate_random_seed.
resolve_random_seed() {
    if [ -n "$RANDOM_SEED_FLAG" ]; then
        RANDOM_SEED="$RANDOM_SEED_FLAG"
        RANDOM_SEED_SOURCE="cli (--seed)"
        return
    fi
    # config.lua is a static `return {...}` table, so dofile reads it without the
    # Lua config-loader. pcall guards a malformed config (empty => fall through).
    local cfg_seed
    cfg_seed=$(luajit -e 'local ok,c=pcall(dofile,"'"$DIR"'/config.lua"); if ok and type(c)=="table" and c.randomization and c.randomization.seed then io.write(tostring(c.randomization.seed)) end')
    if [ -n "$cfg_seed" ]; then
        RANDOM_SEED="$cfg_seed"
        RANDOM_SEED_SOURCE="config.lua (randomization.seed)"
        return
    fi
    # Auto: mix epoch seconds with the PID so two runs in the same second differ;
    # fold to a 31-bit non-negative int so it round-trips through CLI/JSON/randomseed.
    RANDOM_SEED=$(( ($(date +%s) * 100000 + $$) % 2147483647 ))
    RANDOM_SEED_SOURCE="auto-generated"
}
# }}}

# {{{ validate_random_seed()
# No fallback on a bad value: a malformed seed is a hard error, because silently
# substituting a random one would defeat the reproducibility this whole feature buys.
# A valid seed is then normalised by stripping leading zeros ("007" -> "7"):
# the value is written verbatim as a JSON number, and JSON does not allow
# leading zeros. The strip is textual, so it cannot overflow on long inputs.
validate_random_seed() {
    case "$RANDOM_SEED" in
        ''|*[!0-9]*)
            echo "ERROR: resolved random seed '$RANDOM_SEED' is not a non-negative integer." >&2
            echo " Fix --seed or config.randomization.seed and re-run." >&2
            exit 1
            ;;
    esac
    # ${RANDOM_SEED%%[!0]*} is the leading run of zeros (everything before the
    # first non-zero digit); removing it leaves "" for an all-zero seed.
    RANDOM_SEED="${RANDOM_SEED#"${RANDOM_SEED%%[!0]*}"}"
    RANDOM_SEED="${RANDOM_SEED:-0}"
}
# }}}

resolve_random_seed
validate_random_seed

# The argument every randomizing subprocess receives. Equals-form on purpose: the
# bare number can never be mistaken for a positional DIR by a child's arg parser.
RANDOM_SEED_ARG="--seed=$RANDOM_SEED"
log_info "🎲 Random seed: $RANDOM_SEED (source: $RANDOM_SEED_SOURCE)"
906
907# {{{ write_generation_metadata()
908# The canonical "which seed made this build?" record. A small JSON at the output
909# root; written early (so an interrupted build still leaves it) and at the root
910# (so per-stage clears, which only touch output/ subdirs, never wipe it).
write_generation_metadata() {
    # Writes <output dir>/generation-metadata.json, where the output dir is
    # OUTPUT_DIR when set and $DIR/output otherwise. Returns non-zero when the
    # directory cannot be created or the file cannot be written.
    local out_dir="${OUTPUT_DIR:-$DIR/output}"
    mkdir -p "$out_dir" || return 1
    local generated_at
    generated_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    # "seed" is emitted as a bare JSON number; every other field is an
    # operator- or script-supplied string and is escaped before being quoted,
    # so a stray quote in e.g. --pages cannot corrupt the file.
    cat > "$out_dir/generation-metadata.json" <<EOF
{
  "seed": $RANDOM_SEED,
  "seed_source": "$(_json_escape "$RANDOM_SEED_SOURCE")",
  "generated_at": "$generated_at",
  "pages": "$(_json_escape "${PAGES:-default}")",
  "poems_per_page": "$(_json_escape "${POEMS_PER_PAGE:-default}")"
}
EOF
}

# _json_escape STRING: print STRING with backslashes and double quotes
# backslash-escaped, for embedding between double quotes in JSON.
_json_escape() {
    local s=$1
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    printf '%s' "$s"
}
# }}}

if $DRY_RUN; then
    # Report the same path write_generation_metadata would use (--output aware).
    log_dry_run "write ${OUTPUT_DIR:-$DIR/output}/generation-metadata.json (seed $RANDOM_SEED)"
else
    # The metadata record is a convenience, not a prerequisite for any stage:
    # warn loudly on failure but let the pipeline continue.
    write_generation_metadata || echo "Warning: could not write generation-metadata.json" >&2
fi
933# }}}
934
935# {{{ Stage execution functions
936
937# {{{ run_update_words
run_update_words() {
    # Stage 1: run scripts/update-words, adding --force when either the global
    # --force or the per-stage override for stage 1 (Issue 10-016) is set.
    # Issue 7-003: --force tells the helper to skip file preservation.
    # A failure here only produces a warning; the pipeline carries on.
    log_stage "📁 Stage 1/10: Updating input files from words repository"

    local -a update_args=()
    if $FORCE || $FORCE_STAGE_1; then
        update_args+=(--force)
    fi

    if $DRY_RUN; then
        log_dry_run "$DIR/scripts/update-words ${update_args[*]}"
        return 0
    fi

    if ! "$DIR/scripts/update-words" "${update_args[@]}"; then
        echo "Warning: Failed to update input files, continuing anyway..." >&2
    fi
}
960# }}}
961
962# {{{ run_extract
run_extract() {
    # Stage 2: run scripts/update against the project directory. A failure of
    # that script terminates the whole run with exit status 1.
    log_stage "🔄 Stage 2/10: Extracting content from backup archives"

    # Issue 8-011: forward boost inclusion only when it was requested.
    local boosts_opt=""
    $INCLUDE_BOOSTS && boosts_opt="--include-boosts"

    if $DRY_RUN; then
        log_dry_run "$DIR/scripts/update $DIR $boosts_opt"
        return 0
    fi

    # Unquoted on purpose: an empty option must expand to no argument at all.
    if ! "$DIR/scripts/update" "$DIR" $boosts_opt; then
        echo "Error: Content extraction failed" >&2
        exit 1
    fi
}
982# }}}
983
984# {{{ run_strip_excluded
985# Issue 10-053: After sync/extraction, remove excluded images + note source files
986# from input/ so they are never cataloged, embedded, rendered, or uploaded. Runs
987# before image cataloging. strip-excluded validates every exclusion BEFORE it
988# deletes anything; a non-zero exit means a broken exclusion path (it points at no
989# real file), which is FATAL -- continuing would ship content that was explicitly
990# marked do-not-ship. The validation happens before any stripping and before the
991# expensive catalog/embed stages, so a bad path costs only the cheap re-run.
run_strip_excluded() {
    # Runs scripts/strip-excluded through lua. A non-zero exit from it is
    # treated as fatal and terminates the run with exit status 1.
    log_stage "🧹 Stripping excluded content from input/"

    if $DRY_RUN; then
        log_dry_run "lua $DIR/scripts/strip-excluded $DIR"
        return 0
    fi

    lua "$DIR/scripts/strip-excluded" "$DIR" || {
        printf '%s\n' \
            "ERROR: strip-excluded failed -- a broken exclusion path in config.lua." \
            " Fix excluded_images and re-run; nothing was stripped or shipped." >&2
        exit 1
    }
}
1004# }}}
1005
1006# {{{ run_parse
run_parse() {
    # Stage 3: invoke src/main.lua in --parse-only mode. A failure terminates
    # the run with exit status 1.
    log_stage "📝 Stage 3/10: Parsing poems from JSON sources"

    # Issue 10-016: forced by the global flag or by the stage-3 flag.
    local regen_opt=""
    if $FORCE || $FORCE_STAGE_3; then
        regen_opt="--force"
    fi

    if $DRY_RUN; then
        log_dry_run "luajit src/main.lua $DIR --parse-only $regen_opt $BOOSTS_ARG $ASSETS_ARG"
        return 0
    fi

    # Optional arguments stay unquoted so that empty ones disappear entirely.
    if ! luajit src/main.lua "$DIR" --parse-only $regen_opt $BOOSTS_ARG $ASSETS_ARG; then
        echo "Error: Poem parsing failed" >&2
        exit 1
    fi
}
1029# }}}
1030
1031# {{{ run_validate
run_validate() {
    # Stage 4: invoke src/main.lua in --validate-only mode. A failure
    # terminates the run with exit status 1.
    local ok_symbol
    ok_symbol=$(symbol_success "")
    log_stage "${ok_symbol} Stage 4/10: Validating poem data"

    if $DRY_RUN; then
        log_dry_run "luajit src/main.lua $DIR --validate-only $ASSETS_ARG"
        return 0
    fi

    # ASSETS_ARG stays unquoted so that an empty value adds no argument.
    if ! luajit src/main.lua "$DIR" --validate-only $ASSETS_ARG; then
        echo "Error: Poem validation failed" >&2
        exit 1
    fi
}
1045# }}}
1046
1047# {{{ run_catalog_images
1048# Issue 10-015a: Pass --verbose flag to show detailed image catalog statistics
run_catalog_images() {
    # Stage 5: invoke src/main.lua in --catalog-only mode, forwarding the
    # verbosity, assets and random-seed arguments. A failure terminates the
    # run with exit status 1.
    log_stage "🖼️ Stage 5/10: Cataloging images"

    # Only pass --verbose when verbose output was requested.
    local detail_opt=""
    if $VERBOSE; then
        detail_opt="--verbose"
    fi

    if $DRY_RUN; then
        log_dry_run "luajit src/main.lua $DIR --catalog-only $detail_opt $ASSETS_ARG $RANDOM_SEED_ARG"
        return 0
    fi

    # Optional arguments stay unquoted so that empty ones disappear entirely.
    if ! luajit src/main.lua "$DIR" --catalog-only $detail_opt $ASSETS_ARG $RANDOM_SEED_ARG; then
        echo "Error: Image cataloging failed" >&2
        exit 1
    fi
}
1066# }}}
1067
1068# {{{ emb_cache_dir
1069# Issue 10-054: resolve a model's cache directory through the shared resolver
1070# (scripts/cache-dir), so run.sh's freshness/pre-flight checks look in EXACTLY the
1071# place the Lua code and generate-embeddings.sh write -- disk or RAM, per the
1072# CACHE_IN_RAM switch. Pass --disk for the reboot-surviving diversity cache. A
1073# blank result is a hard error rather than a silently-wrong (empty) path.
emb_cache_dir() {
    # Prints the cache directory reported by scripts/cache-dir for MODEL_NAME.
    # Any extra arguments (for example --disk) are forwarded to the resolver.
    # An empty answer prints an error to stderr and exits with status 1.
    local resolved
    resolved="$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME" "$@")"

    if [ -n "$resolved" ]; then
        echo "$resolved"
        return
    fi

    echo "Error: could not resolve cache dir (scripts/cache-dir)" >&2
    exit 1
}
1083# }}}
1084
1085# {{{ run_generate_embeddings
run_generate_embeddings() {
    # Stage 6: generate poem embeddings through generate-embeddings.sh.
    #
    # Globals read: DIR, MODEL_NAME, INFERENCE_SERVER, FORCE, FORCE_STAGE_6,
    #               DRY_RUN, NICE_PREFIX.
    # Skips when embeddings.json already has at least one entry per poem;
    # otherwise runs incrementally, or with --full-regen when forced.
    # Exits 1 when the cache dir cannot be resolved or generation fails.
    log_stage "🤖 Stage 6/10: Generating embeddings via the inference server"

    # Resolve the cache dir in a standalone assignment. Folding the call into
    # a "local var=..." line hides its exit status, and the resolver's own
    # "exit 1" only leaves the substitution subshell, so a failed lookup would
    # otherwise become the bogus path "/embeddings.json".
    local cache_dir
    cache_dir=$(emb_cache_dir) || exit 1
    local embeddings_file="$cache_dir/embeddings.json"
    local poems_file="$DIR/assets/poems.json"

    # Issue 10-016: Check both global and per-stage force flags
    local stage_force=$FORCE
    $FORCE_STAGE_6 && stage_force=true

    # Freshness check (Issue 10-050): skip ONLY when every poem already has an
    # embedding. Comparing mtimes was wrong: a run that died part-way leaves a
    # NEWER but INCOMPLETE embeddings.json. Counting entries is the honest
    # signal; incremental mode then fills only the gap.
    if ! $stage_force && [ -f "$embeddings_file" ] && [ -f "$poems_file" ]; then
        # Count embeddings WITHOUT parsing the (large) JSON: each entry carries
        # exactly one "poem_index" key. (This counts error records too, so it
        # can only over-report completeness; incremental retries those anyway.)
        local emb_count
        emb_count=$(grep -o '"poem_index"' "$embeddings_file" | wc -l)
        local poem_count
        poem_count=$(luajit -e "
            package.path = '$DIR/?.lua;' .. package.path
            local dk = require('libs/dkjson')
            local f = io.open('$poems_file'); local d = dk.decode(f:read('*a')); f:close()
            print(#(d.poems or d))
        ")
        if [ -n "$poem_count" ] && [ "$poem_count" -gt 0 ] && [ "$emb_count" -ge "$poem_count" ]; then
            log_info " ⏭️ Embeddings complete ($emb_count/$poem_count), skipping..."
            return 0
        fi
        log_info " Embeddings incomplete ($emb_count/${poem_count:-?}) — running incremental to fill the gap..."
    fi

    # The mode label is derived from the same decision as the argument, so the
    # log cannot claim "incremental" while --full-regen is passed (which is
    # what happened under a per-stage force when the label read $FORCE).
    local force_arg mode_label
    if $stage_force; then
        force_arg="--full-regen"
        mode_label="full regeneration"
    else
        force_arg="--incremental"
        mode_label="incremental (skip existing)"
    fi

    # Issue 10-017: Build Inference server argument
    local server_arg=""
    if [ -n "$INFERENCE_SERVER" ]; then
        server_arg="--server=$INFERENCE_SERVER"
    fi

    if $DRY_RUN; then
        # Only the command this stage really runs is announced. Word embeddings
        # moved to run_generate_word_embeddings, so they are not listed here.
        log_dry_run "$DIR/generate-embeddings.sh $force_arg --model=$MODEL_NAME $server_arg $DIR"
        return 0
    fi

    if [ -n "$INFERENCE_SERVER" ]; then
        log_info " Inference Server: $INFERENCE_SERVER"
    fi
    log_info " Model: $MODEL_NAME"
    # Report the resolved file, not a hard-coded assets/ path, so the log
    # names the location that the freshness check above inspected.
    log_info " Output: $embeddings_file"
    log_info " Mode: $mode_label"

    # Issue 10-028: Apply low priority to expensive embedding generation.
    # NICE_PREFIX and the optional arguments are unquoted on purpose so that
    # empty values vanish and a multi-word prefix splits into words.
    $NICE_PREFIX "$DIR/generate-embeddings.sh" $force_arg --model="$MODEL_NAME" $server_arg "$DIR" || {
        echo "Error: Embedding generation failed" >&2
        echo "Make sure the inference server is running with the $MODEL_NAME model" >&2
        exit 1
    }

    # Word embeddings used to run here, but the word-COLOR step inside
    # generate-word-pages needs color_embeddings.json, which is produced later
    # by run_generate_semantic_colors. Running words first made that step skip
    # with "no color embeddings found". Moved to run_generate_word_embeddings,
    # called AFTER colors in main.
}
1163# }}}
1164
1165# {{{ run_generate_word_embeddings
1166# Word-cloud word embeddings + their semantic colors. Split out of
1167# run_generate_embeddings (Issue 8-043b) and ordered AFTER the semantic-color
1168# stage so color_embeddings.json already exists when the word-color step runs.
run_generate_word_embeddings() {
    # Generates the word-cloud word embeddings by running
    # generate-word-pages.lua in --embeddings-only mode.
    #
    # Globals read: DIR, WORDCLOUD_WORDS, NICE_PREFIX, DRY_RUN.
    # A generator failure is reported as a warning only.
    log_info " Generating word embeddings for word cloud..."

    # WORDCLOUD_WORDS carries either a number or the literal "all"; the
    # generator accepts both via --words (it treats "--words all" the same as
    # "--all").
    local wordcloud_args=""
    if [ -n "$WORDCLOUD_WORDS" ]; then
        wordcloud_args="--words $WORDCLOUD_WORDS"
    fi

    # Honor --dry-run like every other stage function: announce the command
    # instead of running it. The :-false default keeps the old behavior if
    # DRY_RUN happens to be unset.
    if ${DRY_RUN:-false}; then
        log_dry_run "luajit $DIR/src/generate-word-pages.lua $DIR --embeddings-only $wordcloud_args"
        return 0
    fi

    # wordcloud_args is unquoted on purpose: it holds two words, or nothing.
    $NICE_PREFIX luajit "$DIR/src/generate-word-pages.lua" "$DIR" --embeddings-only $wordcloud_args || {
        echo "Warning: Word embedding generation failed, continuing..." >&2
    }
}
1181# }}}
1182
1183# {{{ run_generate_semantic_colors
run_generate_semantic_colors() {
    # Regenerate poem_colors.json if stale or missing.
    # This runs BEFORE similarity matrix generation (Stage 6.5).
    # Requires: embeddings.json, color_embeddings.json
    # Respects: --force (skip freshness check), --dry-run (show actions only)
    #
    # Globals read: DIR, MODEL_NAME, INFERENCE_SERVER, INTERACTIVE, FORCE,
    #               FORCE_STAGE_6, DRY_RUN.
    # Exits 1 when the cache dir cannot be resolved or a generation step fails.
    #
    # NOTE(review): DIR, MODEL_NAME and INFERENCE_SERVER are spliced into Lua
    # source inside single quotes; a value containing a single quote would
    # break the snippet. Left unchanged here.

    # Resolve the cache dir ONCE, in a standalone assignment. Every path below
    # lives in that one directory, so repeating the lookup per file only spawned
    # extra resolver processes, and "local var=$(...)" hid a failed lookup
    # (the paths silently became "/embeddings.json" and so on).
    local cache_dir
    cache_dir=$(emb_cache_dir) || exit 1

    local embeddings_file="$cache_dir/embeddings.json"
    local poem_colors_file="$cache_dir/poem_colors.json"
    local color_embeddings_file="$cache_dir/color_embeddings.json"
    local palette_fp_file="$cache_dir/color_palette.fingerprint"

    # Embeddings must exist first (exit early if not - prevents confusing errors)
    if [ ! -f "$embeddings_file" ]; then
        log_verbose " Skipping semantic colors - embeddings not yet generated"
        return 0
    fi

    # color_embeddings.json is derived from the color palette (color_names +
    # color_associations in config.lua). We fingerprint the palette and
    # regenerate whenever it changes, so editing colors then re-running actually
    # takes effect. The fingerprint is a sorted, deterministic dump of the
    # palette -- no server needed to compute it.
    local current_palette_fp
    current_palette_fp=$(luajit -e "
        package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
        local config = require('config-loader').load()
        local names = {}
        for _, n in ipairs(config.color_names or {}) do names[#names+1] = n end
        table.sort(names)
        local parts = {}
        for _, n in ipairs(names) do
            local a = {}
            for _, w in ipairs((config.color_associations or {})[n] or {}) do a[#a+1] = w end
            table.sort(a)
            parts[#parts+1] = n .. '=' .. table.concat(a, ',')
        end
        io.write(table.concat(parts, '|'))
    ")
    local stored_palette_fp=""
    [ -f "$palette_fp_file" ] && stored_palette_fp=$(cat "$palette_fp_file")

    # Regenerate color embeddings if missing OR the palette changed since last time.
    if [ ! -f "$color_embeddings_file" ] || [ "$current_palette_fp" != "$stored_palette_fp" ]; then
        # State the real reason: the file can exist here (palette changed), in
        # which case "not found" would be a false statement in the log.
        local regen_reason
        if [ -f "$color_embeddings_file" ]; then
            log_stage "🎨 Stage 6.5/10: Color palette changed -- regenerating color embeddings"
            regen_reason="Color palette changed, regenerating via the inference server..."
        else
            log_stage "🎨 Stage 6.5/10: Generating color embeddings (one-time)"
            regen_reason="Color embeddings not found, generating via the inference server..."
        fi

        if $DRY_RUN; then
            log_dry_run "luajit semantic-color-calculator (generate color embeddings)"
            # Falls through: the poem-colors dry-run line is printed below.
        else
            log_info " $(symbol_warning "⚠️") $regen_reason"
            # Issue 10-003 migrated color_names from config/semantic-colors.json (now deleted)
            # into config.lua, loaded via libs/config-loader.lua. Errors here are loud rather
            # than silent so a missing config doesn't propagate downstream as a confusing
            # "Failed to load required data files" in the next stage.
            luajit -e "
                package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
                local calc = require('semantic-color-calculator')
                local utils = require('utils')
                utils.init_assets_root({'$DIR'})

                -- Mirror the --server selection pattern used elsewhere in run.sh
                -- (see the interactive TUI block below). If INFERENCE_SERVER is empty
                -- the module falls back to config.lua's default_inference_server.
                -- The interactive flag is forwarded so that a typoed --server or
                -- --model triggers a 1/2 prompt only when the operator launched
                -- run.sh with -I; otherwise we hard-error.
                local inference = require('inference-server-config')
                inference.set_project_root('$DIR')
                inference.set_interactive_mode('$INTERACTIVE' == 'true')
                if '$INFERENCE_SERVER' ~= '' then
                    inference.set_selected_server('$INFERENCE_SERVER')
                end

                local config = require('config-loader').load()
                if not config.color_names then
                    error('config.lua is missing color_names (Issue 10-003 migration)')
                end
                -- Pass color_associations so each color's embedding is the mean
                -- of its essence words, not the bare color word (richer + the
                -- z-scored assignment is balanced). nil endpoint = use the
                -- selected server. Falls back to bare words if associations absent.
                local embeddings = calc.generate_color_embeddings(config.color_names, '$MODEL_NAME', nil, config.color_associations)
                if not next(embeddings) then
                    error('Inference server returned no color embeddings')
                end
                local data = {embeddings = embeddings, generated_at = os.date('%Y-%m-%d %H:%M:%S'), model_name = '$MODEL_NAME'}
                utils.write_json_file('$color_embeddings_file', data)
                print('[INFO] Color embeddings saved: ' .. '$color_embeddings_file')
            " || {
                echo "Error: Color embedding generation failed" >&2
                exit 1
            }
            # Remember the palette we just built from, so the next run can tell
            # whether it changed (and skip this server round-trip when it hasn't).
            echo "$current_palette_fp" > "$palette_fp_file"
        fi
    fi

    # Issue 10-016: Check both global and per-stage force flags (Stage 6)
    local stage_force=$FORCE
    $FORCE_STAGE_6 && stage_force=true

    # Check freshness: poem_colors.json should be newer than embeddings.json
    # With --force or --force-stage 6: always regenerate regardless of freshness
    if ! $stage_force && [ -f "$poem_colors_file" ] && [ -f "$embeddings_file" ]; then
        # Poem colors depend on BOTH the poem embeddings AND the color centroids,
        # so they are only fresh when newer than both.
        if [ "$poem_colors_file" -nt "$embeddings_file" ] && [ "$poem_colors_file" -nt "$color_embeddings_file" ]; then
            log_info " ⏭️ Semantic colors are fresh (newer than embeddings + palette), skipping..."
            return 0
        fi
        log_verbose " poem_colors.json is stale (older than embeddings or palette), regenerating..."
    elif $stage_force; then
        log_verbose " --force specified, regenerating semantic colors..."
    fi

    log_stage "🎨 Stage 6b/10: Computing semantic colors (part of embeddings)"

    if $DRY_RUN; then
        log_dry_run "luajit semantic-color-calculator (poem colors regeneration)"
        return 0
    fi

    log_info " Input: $embeddings_file"
    log_info " Output: $poem_colors_file"

    # Regenerate poem colors using existing embeddings
    luajit -e "
        package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
        local calc = require('semantic-color-calculator')
        local utils = require('utils')
        utils.init_assets_root({'$DIR'})

        local poems_data = utils.read_json_file(utils.asset_path('poems.json'))
        local embeddings_data = utils.read_json_file('$embeddings_file')
        local color_embeddings_data = utils.read_json_file('$color_embeddings_file')

        if poems_data and embeddings_data and color_embeddings_data then
            calc.precompute_poem_colors(poems_data, embeddings_data, color_embeddings_data.embeddings, '$poem_colors_file')
        else
            error('Failed to load required data files')
        end
    " || {
        echo "Error: Semantic color generation failed" >&2
        exit 1
    }
}
1345# }}}
1346
1347# {{{ run_augment_images
1348# Issue 9-013: give every text-less image a pseudo-embedding (the normalized
1349# average of the poem before and after it chronologically) and fold those into
1350# embeddings.json so the GPU similarity stage ranks images alongside poems.
1351# Also writes image-manifest.json, which the HTML renderer reads to draw image
1352# entries. Cheap and idempotent, so it runs each time before the matrix build.
run_augment_images() {
    # Stage 6.7: run augment-embeddings-with-images.lua on the project.
    #
    # Globals read: DIR, MODEL_NAME (through emb_cache_dir), DRY_RUN, NICE_PREFIX.
    # Exits 1 when the cache dir cannot be resolved, when embeddings.json is
    # missing, or when the augmentation script fails.
    log_stage "🖼️ Stage 6.7: Folding images into the embedding set (pseudo-embeddings)"

    # Standalone assignment so a failed lookup is fatal in its own right.
    # Written as "local var=$(...)" the failure was hidden and then surfaced as
    # a misleading "embeddings.json not found" for the path "/embeddings.json".
    local cache_dir
    cache_dir=$(emb_cache_dir) || exit 1
    local embeddings_file="$cache_dir/embeddings.json"

    if [ ! -f "$embeddings_file" ]; then
        echo "Error: embeddings.json not found; run --generate-embeddings first" >&2
        exit 1
    fi

    if $DRY_RUN; then
        log_dry_run "luajit $DIR/src/augment-embeddings-with-images.lua $DIR"
        return 0
    fi

    # NICE_PREFIX is unquoted on purpose: empty vanishes, multi-word splits.
    $NICE_PREFIX luajit "$DIR/src/augment-embeddings-with-images.lua" "$DIR" || {
        echo "Error: image augmentation failed" >&2
        exit 1
    }
}
1370# }}}
1371
1372# {{{ run_generate_similarity
run_generate_similarity() {
    # Stage 7: build the per-poem similarity files on the GPU.
    #
    # Globals read: DIR, MODEL_NAME, FORCE, FORCE_STAGE_7, THREADS, PAGES,
    #               POEMS_PER_PAGE, DRY_RUN.
    # Exits 1 when the GPU library or embeddings.json is missing, when the
    # cache dir cannot be resolved, or when generation fails.

    # GPU (Vulkan) is required: these are O(N^2) similarity calculations that make no
    # sense on a CPU, so the CPU route was removed (Issue 10-057). A missing GPU library
    # is a hard error with build instructions, never a slow fallback.
    if [ ! -f "$DIR/libs/vulkan-compute/build/libvkcompute.so" ]; then
        echo "Error: GPU library not found: libs/vulkan-compute/build/libvkcompute.so" >&2
        echo "Build it: cd libs/vulkan-compute && make" >&2
        exit 1
    fi
    log_stage "📊 Stage 7/10: Building similarity matrix with GPU (~5-10 min)"

    # Resolve the cache dir ONCE, in a standalone assignment: "local var=$(...)"
    # hides a failed lookup, and the same lookup was repeated for each path
    # (including once inside the Lua source below).
    local cache_dir
    cache_dir=$(emb_cache_dir) || exit 1
    local embeddings_file="$cache_dir/embeddings.json"

    # Check if embeddings exist
    if [ ! -f "$embeddings_file" ]; then
        echo "Error: Embeddings file not found: $embeddings_file" >&2
        echo "Run --generate-embeddings first" >&2
        exit 1
    fi

    # Issue 10-016: Check both global and per-stage force flags (Stage 7)
    local stage_force=$FORCE
    $FORCE_STAGE_7 && stage_force=true

    # Issue 8-033: Check for individual similarity files instead of monolithic matrix
    local similarities_dir="$cache_dir/similarities"
    local similarity_count=0
    if [ -d "$similarities_dir" ]; then
        similarity_count=$(find "$similarities_dir" -name "poem_*.json" 2>/dev/null | wc -l)
    fi

    # NOTE(review): this threshold is a fixed number, not derived from the
    # current poem set; presumably it was the poem count when it was written.
    # Confirm it still matches the corpus size.
    local min_similarity_files=7797

    # Freshness check: skip if we have all files and they're fresh
    if ! $stage_force && [ "$similarity_count" -ge "$min_similarity_files" ]; then
        # Check if any are older than embeddings (check newest file)
        local newest_similarity
        newest_similarity=$(find "$similarities_dir" -name "poem_*.json" -type f -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
        if [ -n "$newest_similarity" ] && [ "$newest_similarity" -nt "$embeddings_file" ]; then
            log_info " ⏭️ Similarity files are fresh ($similarity_count files newer than embeddings), skipping..."
            return 0
        fi
    fi

    local threads_arg=""
    if [ -n "$THREADS" ]; then
        threads_arg="--threads=$THREADS"
    fi

    if $DRY_RUN; then
        log_dry_run "luajit (GPU vk_similarity via libvkcompute.so) --generate-matrix $threads_arg"
        return 0
    fi

    # Report the resolved locations rather than a hard-coded assets/ path, so
    # the log names the paths the checks above actually used.
    log_info " Input: $embeddings_file"
    log_info " Output: $similarities_dir/*.json (individual files)"

    # Issue 10-016: Convert stage_force to Lua boolean for Lua function calls
    local stage_force_lua="false"
    $stage_force && stage_force_lua="true"

    # GPU similarity generation using Vulkan compute shaders (the only route now)
    log_info " Mode: GPU-accelerated (Vulkan)"

    # Pass threads value to GPU similarity (defaults to 8 if not specified)
    local default_threads=8
    local threads_to_use=${THREADS:-$default_threads}
    log_info " CPU sorting threads: $threads_to_use"

    DIR="$DIR" luajit -e "
        package.path = '$DIR/?.lua;$DIR/?/init.lua;$DIR/libs/?.lua;' .. package.path
        local vk_sim = require('libs.vulkan-compute.lua.vk_similarity')
        -- Issue 10-057: size the rankings cache to exactly what THIS build shows
        -- per poem -- the ACTUAL pages it generates (the --pages value, else the
        -- config default minimum_pages -- NOT the storage ceiling
        -- max_pages_per_poem) times the poems shown per page. Everything is read
        -- at runtime from the run's flags + config; no hardcoded page counts. The
        -- HTML stage's loader regenerates if a later run ever needs more (the
        -- top_k stamp makes that detectable). The list is sorted nearest-first, so
        -- the top-K ARE precisely what the pages display.
        local _cfg = require('config-loader'); _cfg.set_project_root('$DIR')
        local _pag = _cfg.load().pagination
        if not _pag then error('config.pagination missing; cannot size the rankings cache') end
        local _pages = tonumber('$PAGES') or _pag.minimum_pages
        local _per_page = tonumber('$POEMS_PER_PAGE') or _pag.poems_per_page
        if not _pages or not _per_page then
            error('cannot resolve pages/poems_per_page to size the rankings cache')
        end
        local _top_k = _pages * _per_page
        -- Use TRUE parallel GPU computation (Issue 9-002 original design)
        local success = vk_sim.generate_similarity_matrix_gpu_parallel(
            '$embeddings_file',
            '$MODEL_NAME',
            $stage_force_lua,
            $threads_to_use,
            _top_k
        )
        if not success then
            print('[GPU SIMILARITY ERROR] GPU generation failed')
            os.exit(1)
        end
    " || {
        echo "Error: GPU similarity generation failed" >&2
        exit 1
    }

    # Note: Pre-sorted similarity rankings cache is now generated automatically
    # by the GPU similarity engine (in-RAM, no file re-reading needed)
}
1481# }}}
1482
1483# {{{ run_generate_diversity
run_generate_diversity() {
    # Stage 8: pre-compute the diversity cache on the GPU.
    #
    # Globals read: DIR, MODEL_NAME, FORCE, FORCE_STAGE_8, PAGES,
    #               POEMS_PER_PAGE, DRY_RUN, NICE_PREFIX.
    # Exits 1 when the GPU library or embeddings.json is missing, when either
    # cache dir cannot be resolved, or when generation fails.

    # GPU (Vulkan) is required: the diversity walk is O(N^2) GPU work, so the CPU route
    # was removed (Issue 10-057). A missing GPU library is a hard error, not a fallback.
    if [ ! -f "$DIR/libs/vulkan-compute/build/libvkcompute.so" ]; then
        echo "Error: GPU library not found: libs/vulkan-compute/build/libvkcompute.so" >&2
        echo "Build it: cd libs/vulkan-compute && make" >&2
        exit 1
    fi
    log_stage "🎲 Stage 8/10: Pre-computing diversity cache with GPU (~1 min)"

    # Two lookups are needed: the diversity cache uses the --disk location,
    # the embeddings use the default one. Each is a standalone assignment so a
    # failed lookup is fatal; inside "local var=$(...)" the failure was hidden
    # and the path silently became "/diversity_cache.json".
    local disk_cache_dir cache_dir
    disk_cache_dir=$(emb_cache_dir --disk) || exit 1
    cache_dir=$(emb_cache_dir) || exit 1
    local cache_file="$disk_cache_dir/diversity_cache.json"
    local embeddings_file="$cache_dir/embeddings.json"

    # Check if embeddings exist
    if [ ! -f "$embeddings_file" ]; then
        echo "Error: Embeddings file not found: $embeddings_file" >&2
        echo "Run --generate-embeddings first" >&2
        exit 1
    fi

    # Issue 10-016: Check both global and per-stage force flags (Stage 8)
    local stage_force=$FORCE
    $FORCE_STAGE_8 && stage_force=true

    # Freshness check: skip if cache newer than embeddings
    if ! $stage_force && [ -f "$cache_file" ]; then
        if [ "$cache_file" -nt "$embeddings_file" ]; then
            log_info " ⏭️ Diversity cache is fresh (newer than embeddings), skipping..."
            return 0
        fi
    fi

    # Report the resolved locations rather than a hard-coded assets/ path, so
    # the log names the files the checks above actually used.
    log_info " Input: $embeddings_file"
    log_info " Output: $cache_file"

    # GPU diversity generation using Vulkan compute shaders (the only route now)
    log_info " Mode: GPU-accelerated (Vulkan)"

    if $DRY_RUN; then
        log_dry_run "$DIR/scripts/precompute-diversity-sequences-gpu $DIR"
        return 0
    fi

    # Issue 10-028: Apply low priority to expensive diversity generation.
    # The model is no longer passed via env here: the wrapper resolves it
    # through inference-server-config, which reads this run's overrides notepad
    # (tmp/run-overrides.lua, written above from --model) and falls back to
    # config.lua -- so the CLI override is honored without a per-stage env var.
    # Issue 10-057: pass the run's page settings so the wrapper caps each diversity
    # sequence to the SAME K the similarity cache and the HTML stage use.
    PAGES="$PAGES" POEMS_PER_PAGE="$POEMS_PER_PAGE" $NICE_PREFIX "$DIR/scripts/precompute-diversity-sequences-gpu" "$DIR" || {
        echo "Error: GPU diversity cache generation failed" >&2
        exit 1
    }
}
1541# }}}
1542
1543# {{{ run_generate_html
run_generate_html() {
    # Stage 9: generate the site HTML, then the image gallery and the source
    # browser.
    #
    # Globals read: DIR, FORCE, FORCE_STAGE_9, THREADS, PAGES, POEMS_PER_PAGE,
    #               CHRONO_PER_PAGE, ASSETS_ARG, DRY_RUN, NICE_PREFIX.
    # Exits 1 when template sync or the main HTML generation fails; gallery and
    # source-browser failures are warnings only.
    log_stage "🌐 Stage 9/10: Generating website HTML"

    # Issue 10-016: Check both global and per-stage force flags (Stage 9)
    local stage_force=$FORCE
    $FORCE_STAGE_9 && stage_force=true

    local force_arg=""
    if $stage_force; then
        force_arg="--force"
    fi

    local threads_arg=""
    if [ -n "$THREADS" ]; then
        threads_arg="--threads $THREADS"
    fi

    # Issue 8-022: Pagination arguments
    local pages_arg=""
    if [ -n "$PAGES" ]; then
        pages_arg="--pages $PAGES"
    fi

    local poems_per_page_arg=""
    if [ -n "$POEMS_PER_PAGE" ]; then
        poems_per_page_arg="--poems-per-page $POEMS_PER_PAGE"
    fi

    local chrono_per_page_arg=""
    if [ -n "$CHRONO_PER_PAGE" ]; then
        chrono_per_page_arg="--chrono-per-page $CHRONO_PER_PAGE"
    fi

    if $DRY_RUN; then
        if $stage_force; then
            log_dry_run "clear $DIR/output/{similar,different,chronological}/*.html (--force)"
        fi
        log_dry_run "$DIR/scripts/sync-page-templates $DIR (restore explore-page copy into input/pages/)"
        log_dry_run "luajit src/main.lua $DIR --html-only $force_arg $threads_arg $pages_arg $poems_per_page_arg $chrono_per_page_arg $ASSETS_ARG"
        log_dry_run "luajit $DIR/src/generate-gallery-pages.lua $DIR"
        log_dry_run "luajit $DIR/src/generate-source-browser.lua $DIR"
        return 0
    fi

    # Issue 10-024: Clear output directories when forcing regeneration.
    # This prevents stale files with obsolete poem_index values from persisting
    # after poem re-extraction changes the poem_index assignments.
    # Done only AFTER the dry-run check above: a dry run must never delete
    # files. find -delete is used instead of "rm -f dir/*.html" because a large
    # page set can exceed the argument-length limit, and that rm failure was
    # hidden by 2>/dev/null, silently leaving every stale page in place.
    if $stage_force; then
        log_info " Clearing stale HTML files (--force)..."
        local view
        for view in similar different chronological; do
            [ -d "$DIR/output/$view" ] || continue
            find "$DIR/output/$view" -maxdepth 1 -name '*.html' ! -type d -delete || {
                echo "Warning: could not clear stale HTML in output/$view" >&2
            }
        done
    fi

    # Issue 11-005: restore the authored explore-page copy into the ephemeral
    # input/pages/ before generating. The canonical, version-controlled source is
    # page-templates/*.txt; input/ is wiped + re-synced from external sources each
    # run and does NOT carry this prose, so it is copied back in here. (Edit the
    # files in page-templates/ -- input/pages/ is overwritten from them.)
    "$DIR/scripts/sync-page-templates" "$DIR" || {
        echo "Error: failed to restore page templates into input/pages/" >&2
        exit 1
    }

    # Issue 10-028: Apply low priority to HTML generation (parallel processing).
    # Optional arguments are unquoted on purpose so empty ones vanish.
    $NICE_PREFIX luajit src/main.lua "$DIR" --html-only $force_arg $threads_arg $pages_arg $poems_per_page_arg $chrono_per_page_arg $ASSETS_ARG || {
        echo "Error: HTML generation failed" >&2
        exit 1
    }

    # Issue 10-059: the word-cloud menu and per-word similarity pages moved to their
    # own stage 10 (run_generate_wordcloud). They run after this stage, so the
    # chronological pages main.lua just built are already present for their #poem links.

    # Issue 10-042: Build the image gallery (masonry pages per source + index +
    # chronological). It was previously a separate manual step, so the gallery
    # went stale -- it now regenerates with every HTML run from image-catalog.json.
    log_info " Generating image gallery..."
    $NICE_PREFIX luajit "$DIR/src/generate-gallery-pages.lua" "$DIR" || {
        echo "Warning: Gallery generation failed, continuing..." >&2
    }

    # Issue 10-052: Build the link-only source browser (code/issues/docs as HTML)
    # under output/source/. This is the "git push that builds a webpage" -- the
    # private monorepo never leaves the machine; whoever has the site link can
    # browse the source. It publishes an ALLOWLIST only (never the private input
    # corpus), so it is safe to ship with the rest of the site.
    log_info " Generating source browser..."
    $NICE_PREFIX luajit "$DIR/src/generate-source-browser.lua" "$DIR" || {
        echo "Warning: Source browser generation failed, continuing..." >&2
    }
    # NOTE: the downloadable zip is built at POST time by running
    # scripts/build-download-zip directly, not here -- it is a deploy artifact, and
    # there is no point regenerating a multi-GB archive on every local build. (The
    # site's links are document-relative, so there is no URL-conversion step before
    # upload; just upload output/ and build the zip.)
}
1638# }}}
1639
1640# {{{ run_generate_wordcloud
1641# Issue 10-059: the word-cloud stage. Builds the site's entry menu (which carries the
1642# live poem index) and the per-word similarity pages. Runs after stage 9, so the
1643# chronological pages its #poem links target already exist. Replaces the retired
1644# numeric-similarity-index stage, whose output (numeric-index.html) was linked from
1645# nowhere and was superseded by the menu's embedded poem index.
run_generate_wordcloud() {
    # Stage 10: generate the word-cloud menu and the per-word pages.
    #
    # Globals read: DIR, WORDCLOUD_WORDS, WORDCLOUD_POEMS, CHRONO_PER_PAGE,
    #               RANDOM_SEED_ARG, DRY_RUN, NICE_PREFIX.
    # Exits 1 when either generator fails.
    log_stage "🔤 Stage 10/10: Generating word-cloud menu and per-word pages"

    # Word-cloud arguments. WORDCLOUD_WORDS is a number or "all"; --words carries
    # either ("--words all" == every word, per the generators).
    local wordcloud_words_arg=""
    if [ -n "$WORDCLOUD_WORDS" ]; then
        wordcloud_words_arg="--words $WORDCLOUD_WORDS"
    fi

    # Issue 8-050d: Poems per word-cloud page
    local wordcloud_poems_arg=""
    if [ -n "$WORDCLOUD_POEMS" ]; then
        wordcloud_poems_arg="--poems-per-page $WORDCLOUD_POEMS"
    fi

    # Issue 10-036: thread chrono_per_page so the word-cloud poem links paginate to
    # the SAME chronological pages stage 9 built (separate processes must agree on
    # page size, or every #poem link lands on the wrong page).
    local chrono_per_page_arg=""
    if [ -n "$CHRONO_PER_PAGE" ]; then
        chrono_per_page_arg="--chrono-per-page $CHRONO_PER_PAGE"
    fi

    if $DRY_RUN; then
        log_dry_run "clear $DIR/output/wordcloud/*.html"
        log_dry_run "luajit $DIR/src/wordcloud-generator.lua $DIR $wordcloud_words_arg $chrono_per_page_arg $RANDOM_SEED_ARG"
        log_dry_run "luajit $DIR/src/generate-word-pages.lua $DIR --html-only $wordcloud_words_arg $wordcloud_poems_arg $chrono_per_page_arg"
        return 0
    fi

    # Issue 10-059/10-061: wipe the per-word pages before regenerating. A word that
    # has fallen out of the cloud since the last build leaves an orphan page that the
    # generator never overwrites -- and an orphan from before a link-scheme change
    # ships BROKEN links. The pages are fully regenerated from the current word set
    # just below, so clearing every run (not only on --force) is safe and is the
    # only way to guarantee no stale orphans.
    # Done only AFTER the dry-run check above: a dry run must never delete
    # files, and here nothing would have regenerated them. find -delete avoids
    # the argument-length limit that "rm -f dir/*.html" hits on a large page set.
    if [ -d "$DIR/output/wordcloud" ]; then
        log_info " Clearing stale per-word pages before regeneration..."
        find "$DIR/output/wordcloud" -maxdepth 1 -name '*.html' ! -type d -delete || {
            echo "Warning: could not clear stale per-word pages in output/wordcloud" >&2
        }
    fi

    # The word cloud IS the site's menu (and carries the live poem index), so a
    # failure here is fatal, not a warning -- there is no usable entry page without it.
    # Optional arguments are unquoted on purpose so empty ones vanish.
    log_info " Generating word cloud menu..."
    $NICE_PREFIX luajit "$DIR/src/wordcloud-generator.lua" "$DIR" $wordcloud_words_arg $chrono_per_page_arg $RANDOM_SEED_ARG || {
        echo "Error: Word cloud menu generation failed" >&2
        exit 1
    }

    log_info " Generating word similarity pages..."
    $NICE_PREFIX luajit "$DIR/src/generate-word-pages.lua" "$DIR" --html-only $wordcloud_words_arg $wordcloud_poems_arg $chrono_per_page_arg || {
        echo "Error: Word similarity page generation failed" >&2
        exit 1
    }
}
1703# }}}
1704
1705# }}}
1706
1707# {{{ interactive_mode_tui
1708# TUI-based interactive mode with command preview
1709# Uses Lua menu library for stable rendering and real-time command preview
interactive_mode_tui() {
    # Show the pipeline TUI and copy the operator's selections into the global
    # flags the rest of run.sh reads.
    #
    # Globals read:    TUI_AVAILABLE, DIR, ASSETS_ARG
    # Globals written: UPDATE_WORDS .. GENERATE_WORDCLOUD, FORCE, FORCE_STAGE_1..10,
    #                  DRY_RUN, VERBOSE, INCLUDE_BOOSTS, THREADS, PAGES,
    #                  POEMS_PER_PAGE, CHRONO_PER_PAGE, WORDCLOUD_WORDS, WORDCLOUD_POEMS
    # Returns: 0 once "Run" is chosen with at least one stage selected; the Lua
    #          fallback's own exit status when the TUI cannot be used.
    # Exits:   0 when the operator quits the menu.

    # Either failure below hands control to the Lua-based interactive mode.
    local fallback_reason=""
    if ! $TUI_AVAILABLE; then
        fallback_reason="ERROR: TUI library not available."
    elif ! tui_init; then
        fallback_reason="ERROR: TUI initialization failed."
    fi
    if [[ -n "$fallback_reason" ]]; then
        echo "$fallback_reason" >&2
        echo "Falling back to Lua-based interactive mode..." >&2
        # Resolve main.lua against $DIR (like every other generator call in this
        # file) so the fallback does not depend on the caller's working directory.
        # $ASSETS_ARG stays unquoted so an empty value contributes no argument.
        luajit "$DIR/src/main.lua" "$DIR" -I $ASSETS_ARG
        return $?
    fi

    # Build the menu
    menu_init
    menu_set_title "Neocities Pipeline" "Use j/k to navigate, space to toggle, Enter to run"

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 1: Pipeline Stages (multi - can select multiple)
    # Each checkbox maps to a CLI flag for command preview
    # Issue 10-016: Force regeneration moved here with per-stage options
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "stages" "multi" "Pipeline Stages (toggle stages to run)"

    # Issue 10-016: Global force regenerate option at top of stages
    menu_add_item "stages" "force" "Force regenerate ALL stages" "checkbox" "0" \
        "Force regeneration even if files are fresh" "" "--force"

    # One row per stage, in pipeline order (row position == stage number):
    #   menu id | global flag variable | default | CLI flag | label | description
    local -a stage_table=(
        "update_words|UPDATE_WORDS|1|--update-words|1. Update Words|Sync input files from words repository"
        "extract|EXTRACT|1|--extract|2. Extract|Extract content from backup archives"
        "parse|PARSE|1|--parse|3. Parse|Parse poems from JSON sources into poems.json"
        "validate|VALIDATE|1|--validate|4. Validate|Run poem validation"
        "catalog_images|CATALOG_IMAGES|1|--catalog-images|5. Catalog Images|Catalog images from input directories"
        "generate_embeddings|GENERATE_EMBEDDINGS|0|--generate-embeddings|6. Embeddings ⚠️|Generate embeddings via the inference server (~2-3 hours)"
        "generate_similarity|GENERATE_SIMILARITY|0|--generate-similarity|7. Similarity ⚠️|Build similarity matrix (~30 min)"
        "generate_diversity|GENERATE_DIVERSITY|0|--generate-diversity|8. Diversity ⚠️|Pre-compute diversity cache (~42 hours)"
        "generate_html|GENERATE_HTML|1|--generate-html|9. Generate HTML|Generate website HTML (chronological + similarity pages)"
        "generate_wordcloud|GENERATE_WORDCLOUD|1|--generate-wordcloud|10. Generate Word Cloud|Generate the word-cloud menu and per-word similarity pages"
    )

    # stage_ids / stage_vars: per-stage menu ids and the globals they drive.
    # toggle_map: every checkbox as "menu_id:GLOBAL", filled while registering.
    local -a stage_ids=() stage_vars=() toggle_map=()
    local row stage_id stage_var stage_default stage_flag stage_label stage_desc
    local stage_num=0
    for row in "${stage_table[@]}"; do
        IFS='|' read -r stage_id stage_var stage_default stage_flag stage_label stage_desc <<< "$row"
        stage_num=$(( stage_num + 1 ))
        menu_add_item "stages" "$stage_id" "$stage_label" "checkbox" "$stage_default" \
            "$stage_desc" "" "$stage_flag"
        menu_add_item "stages" "force_$stage_id" " ↳ Force regenerate" "checkbox" "0" \
            "Force regenerate this stage only" "" "--force-stage $stage_num"
        stage_ids+=("$stage_id")
        stage_vars+=("$stage_var")
        toggle_map+=("$stage_id:$stage_var" "force_$stage_id:FORCE_STAGE_$stage_num")
    done

    # Issue 10-016: Dependencies - per-stage force options disabled when global force is checked
    # invert=true means: enable per-stage force when global force is NOT checked
    for stage_id in "${stage_ids[@]}"; do
        menu_add_dependency "force_$stage_id" "force" "1" "true" \
            "Disabled: global force is active" "orange"
    done

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 2: Configuration Options
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "config" "multi" "Configuration"
    # Issue 10-034: Orchestrator pattern enables parallel HTML with low memory
    # Main thread sends 80KB work slices instead of workers loading 700MB caches
    # Expected memory: ~2.5GB total (vs 14GB+ before fix)
    menu_add_item "config" "threads" "Thread Count" "flag" "4:8" \
        "Threads for HTML gen (orchestrator mode)" "" "--threads"
    # Issue 8-022: Pagination options for HTML generation
    menu_add_item "config" "pages" "Pages per Poem" "flag" ":2" \
        "Pages to generate per poem (default: from config, 1)" "" "--pages"
    menu_add_item "config" "poems_per_page" "Poems per Page" "flag" ":3" \
        "Poems per page for similar/different (default: 200)" "" "--poems-per-page"
    menu_add_item "config" "chrono_per_page" "Chrono per Page" "flag" ":3" \
        "Poems per page for chronological (default: 500)" "" "--chrono-per-page"
    # Issue 10-016: Force Regeneration moved to stages section
    menu_add_item "config" "dry_run" "Dry Run" "checkbox" "0" \
        "Show what would be executed without running" "" "--dry-run"
    menu_add_item "config" "verbose" "Verbose Output" "checkbox" "0" \
        "Show detailed progress information" "" "--verbose"
    menu_add_item "config" "include_boosts" "Include Boosts" "checkbox" "0" \
        "Include fediverse boosts/reblogs in extraction" "" "--include-boosts"
    toggle_map+=("force:FORCE" "dry_run:DRY_RUN" "verbose:VERBOSE" "include_boosts:INCLUDE_BOOSTS")

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 3: Word Cloud Configuration
    # Issue 8-043: Configurable word count with "all words" toggle
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "wordcloud" "multi" "Word Cloud Options"
    menu_add_item "wordcloud" "wordcloud_all" "All Words" "checkbox" "0" \
        "Include all words (disables word count limit)" "" "--wordcloud-words all"
    menu_add_item "wordcloud" "wordcloud_words" "Word Count" "flag" "200:3" \
        "Maximum words in word cloud (default: 200)" "" "--wordcloud-words"
    # Issue 8-050d: Poems per word-cloud page
    menu_add_item "wordcloud" "wordcloud_poems" "Poems Per Page" "flag" "50:3" \
        "Poems per word-cloud similarity page (default: 50)" "" "--wordcloud-poems"
    # Dependency: Disable wordcloud_words when wordcloud_all is checked
    # invert=true means: enable wordcloud_words when wordcloud_all is NOT checked (value "1")
    menu_add_dependency "wordcloud_words" "wordcloud_all" "1" "true" \
        "Word count disabled when 'All Words' is checked"

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 4: Command Preview (shows the command that will be executed)
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "preview" "multi" "Command Preview"
    menu_add_item "preview" "cmd_preview" "" "text" "" \
        "The command that will be executed (press ~ to copy to clipboard)"

    # Configure command preview - links checkboxes to command string
    menu_set_command_config "./run.sh" "cmd_preview" ""

    # ═══════════════════════════════════════════════════════════════════════════
    # Section 5: Actions
    # ═══════════════════════════════════════════════════════════════════════════
    menu_add_section "actions" "single" "Actions"
    menu_add_item "actions" "run" "Run Selected Stages" "action" "" \
        "Execute the selected pipeline stages" ""

    # Numeric fields as "menu_id:GLOBAL". An empty or "0" value leaves the global
    # as it was (Issue 8-022 pagination, Issue 8-050d poems per word-cloud page).
    local -a number_map=(
        "threads:THREADS"
        "pages:PAGES"
        "poems_per_page:POEMS_PER_PAGE"
        "chrono_per_page:CHRONO_PER_PAGE"
        "wordcloud_poems:WORDCLOUD_POEMS"
    )

    # Run the menu loop
    local pair raw any_stage
    while true; do
        if ! menu_run; then
            # User quit
            menu_cleanup
            echo "Goodbye!"
            exit 0
        fi

        # User selected "run": every checkbox becomes a true/false global
        # (stage selection, global and per-stage force, dry-run, verbose, boosts).
        for pair in "${toggle_map[@]}"; do
            if [[ "$(menu_get_value "${pair%%:*}")" == "1" ]]; then
                printf -v "${pair#*:}" '%s' true
            else
                printf -v "${pair#*:}" '%s' false
            fi
        done

        for pair in "${number_map[@]}"; do
            raw=$(menu_get_value "${pair%%:*}")
            if [[ -n "$raw" && "$raw" != "0" ]]; then
                printf -v "${pair#*:}" '%s' "$raw"
            fi
        done

        # Issue 8-043: Set the word count from the TUI. The "All Words" checkbox
        # wins -- it sets the count to the literal "all" (and the dependency has
        # already disabled the now-irrelevant Word Count field). Otherwise the
        # typed count is used. One value, WORDCLOUD_WORDS, feeds --wordcloud-words.
        if [[ "$(menu_get_value "wordcloud_all")" == "1" ]]; then
            WORDCLOUD_WORDS="all"
        else
            raw=$(menu_get_value "wordcloud_words")
            if [[ -n "$raw" && "$raw" != "0" ]]; then
                WORDCLOUD_WORDS="$raw"
            fi
        fi

        # Check if at least one stage is selected
        any_stage=false
        for pair in "${stage_vars[@]}"; do
            if [[ "${!pair}" == "true" ]]; then
                any_stage=true
            fi
        done
        if ! $any_stage; then
            echo ""
            echo "No stages selected. Please select at least one stage to run."
            echo "Press Enter to continue..."
            read -r
            continue
        fi

        # Exit menu and run the pipeline
        menu_cleanup
        return 0
    done
}
1981# }}}
1982
1983# {{{ Main execution
1984
# Handle interactive mode
EXECUTED_COMMAND=""  # Store command for post-run display
if $INTERACTIVE; then
    log_info "🎛️ Launching interactive mode with command preview..."
    # On the TUI path interactive_mode_tui returns 0 (quitting the menu exits the
    # script from inside it). A non-zero status can therefore only come from the
    # Lua fallback failing; stop here rather than run the pipeline with flags the
    # operator never confirmed.
    # NOTE(review): a *successful* fallback still falls through to the stages
    # below -- confirm that is the intended behaviour.
    interactive_mode_tui || exit $?
    # Save the command preview for display after execution. Guarded because the
    # fallback path is taken precisely when the TUI library may be unavailable
    # (presumably menu_get_value comes from that library -- TODO confirm).
    if command -v menu_get_value >/dev/null 2>&1; then
        EXECUTED_COMMAND=$(menu_get_value "cmd_preview")
    fi
    # After TUI, fall through to execute selected stages
fi
1994
# Show what will be executed (in non-interactive or after TUI selection)
if $DRY_RUN || $VERBOSE; then
    echo "Pipeline stages to execute:"
    # Issue 10-051 / alignment: the plan is rendered as a two-column TABLE: stage
    # names left-aligned, measured average time right-aligned. A stage that has
    # run here before shows its measured wall-clock average; one that has not
    # shows a coarse magnitude word (short/medium/long) instead. Heavy stages
    # carry the warning glyph.
    #
    # Row format: "enabled|number|name|warned|timing-key|magnitude". The timing
    # key may differ from the display name (word-cloud history is stored under
    # "wordcloud" but shown as "generate-wordcloud").
    # NOTE(review): generate-diversity is tagged "medium" although the file header
    # estimates ~42 hours for that stage -- confirm the magnitude word.
    _plan_rows=(
        "$UPDATE_WORDS|1|update-words|0|update-words|short"
        "$EXTRACT|2|extract|0|extract|short"
        "$PARSE|3|parse|0|parse|short"
        "$VALIDATE|4|validate|0|validate|short"
        "$CATALOG_IMAGES|5|catalog-images|0|catalog-images|short"
        "$GENERATE_EMBEDDINGS|6|generate-embeddings|1|generate-embeddings|long"
        "$GENERATE_SIMILARITY|7|generate-similarity|1|generate-similarity|medium"
        "$GENERATE_DIVERSITY|8|generate-diversity|1|generate-diversity|medium"
        "$GENERATE_HTML|9|generate-html|0|generate-html|medium"
        "$GENERATE_WORDCLOUD|10|generate-wordcloud|0|wordcloud|short"
    )

    # Timing history is optional: only consult it when the helper is defined.
    _have_timing=false
    if command -v stage_timing_mean >/dev/null 2>&1; then
        _have_timing=true
    fi

    # Pass 1: gather the enabled rows and measure the widest label and time.
    # The warning glyph counts as ONE display column (not its byte length) so
    # the multibyte character does not skew the alignment.
    _p_num=()
    _p_label=()
    _p_lvis=()
    _p_time=()
    _p_tail=()
    _labelw=0
    _timew=0
    for _row in "${_plan_rows[@]}"; do
        IFS='|' read -r _en _num _name _warn _key _mag <<< "$_row"
        if [[ "$_en" != "true" ]]; then
            continue
        fi

        _lbl="$_name"
        _lvis=${#_name}
        if [[ "$_warn" == "1" ]]; then
            # One separating space plus one column for the glyph.
            _lbl+=" $(symbol_warning "")"
            _lvis=$(( _lvis + 2 ))
        fi

        _time=""
        _tail="$_mag"
        if $_have_timing; then
            _mean="$(stage_timing_mean "$_key" 2>/dev/null)"
            if [[ -n "$_mean" ]]; then
                _cnt="$(stage_timing_count "$_key")"
                if [[ "$_cnt" == "1" ]]; then
                    _pl=""
                else
                    _pl="s"
                fi
                _time="$(stage_timing_format_seconds "$_mean")"
                _tail="last ${_cnt} run${_pl}"
            fi
        fi

        _p_num+=("$_num")
        _p_label+=("$_lbl")
        _p_lvis+=("$_lvis")
        _p_time+=("$_time")
        _p_tail+=("$_tail")
        if (( _lvis > _labelw )); then
            _labelw=$_lvis
        fi
        if (( ${#_time} > _timew )); then
            _timew=${#_time}
        fi
    done

    # Pass 2: print aligned. Number in a 3-wide field ("1." / "10."), label padded
    # to _labelw, time right-aligned to _timew inside "(avg <time>, <tail>)".
    for (( _i = 0; _i < ${#_p_num[@]}; _i++ )); do
        _pad=$(( _labelw - _p_lvis[_i] ))
        _sp=""
        if (( _pad > 0 )); then
            printf -v _sp '%*s' "$_pad" ''
        fi
        if [[ -n "${_p_time[_i]}" ]]; then
            printf " %-3s %s%s (avg %*s, %s)\n" \
                "${_p_num[_i]}." "${_p_label[_i]}" "$_sp" \
                "$_timew" "${_p_time[_i]}" "${_p_tail[_i]}"
        else
            printf " %-3s %s%s (%s)\n" \
                "${_p_num[_i]}." "${_p_label[_i]}" "$_sp" "${_p_tail[_i]}"
        fi
    done
    echo ""
fi
2067
2068# {{{ Issue 10-017: Validate Inference server connectivity before embedding stages
# Probe the selected inference server through the Lua config module.
# Globals read: DIR (project root holding libs/), INFERENCE_SERVER (optional name)
# Outputs: one line on stdout, "OK:<name>:<url>" or "FAIL:<name>:<reason>";
#          luajit's stderr is merged in so a Lua error is captured too.
# DIR and INFERENCE_SERVER travel through the environment rather than being
# spliced into the Lua source, so quotes or backslashes in either value cannot
# break or alter the probe script.
_probe_inference_server() {
    RUN_SH_PROJECT_DIR="$DIR" RUN_SH_INFERENCE_SERVER="$INFERENCE_SERVER" luajit -e '
        package.path = os.getenv("RUN_SH_PROJECT_DIR") .. "/libs/?.lua;" .. package.path
        local inference = require("inference-server-config")
        local wanted = os.getenv("RUN_SH_INFERENCE_SERVER") or ""
        if wanted ~= "" then
            inference.set_selected_server(wanted)
        end
        local server = inference.get_selected_server()
        local ok, msg = inference.validate_server(server)
        if ok then
            print("OK:" .. server.name .. ":" .. inference.build_host_url(server))
        else
            print("FAIL:" .. server.name .. ":" .. tostring(msg))
        end
    ' 2>&1
}

if $GENERATE_EMBEDDINGS && ! $DRY_RUN; then
    log_info "Validating Inference server connectivity..."
    VALIDATION_RESULT=$(_probe_inference_server)

    # Result fields are colon-separated: status, server name, then URL or error
    # text (which may itself contain colons, hence -f3-).
    if [[ "$VALIDATION_RESULT" == OK:* ]]; then
        SERVER_NAME=$(cut -d: -f2 <<< "$VALIDATION_RESULT")
        SERVER_URL=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
        log_info " ✓ Inference server '$SERVER_NAME' is reachable at $SERVER_URL"
    else
        # Server unreachable. Try to start it ourselves (and remember we
        # did, so the EXIT trap shuts it down again). If start succeeds,
        # re-validate to confirm /health is responsive before proceeding.
        SERVER_NAME=$(cut -d: -f2 <<< "$VALIDATION_RESULT")
        ERROR_MSG=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
        log_info " ✗ Inference server '$SERVER_NAME' not reachable: $ERROR_MSG"
        log_info " Attempting to start it via scripts/start-llamacpp-server.sh..."

        START_ARGS=("$DIR")
        if [ -n "$INFERENCE_SERVER" ]; then
            START_ARGS+=("--server=$INFERENCE_SERVER")
        fi
        if "$DIR/scripts/start-llamacpp-server.sh" "${START_ARGS[@]}"; then
            WE_STARTED_INFERENCE_SERVER=true

            # Re-validate to confirm the freshly-started server is responsive.
            VALIDATION_RESULT=$(_probe_inference_server)

            if [[ "$VALIDATION_RESULT" == OK:* ]]; then
                SERVER_NAME=$(cut -d: -f2 <<< "$VALIDATION_RESULT")
                SERVER_URL=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
                log_info " ✓ Inference server '$SERVER_NAME' started at $SERVER_URL"
                log_info " (will be shut down again when this run completes)"
            else
                ERROR_MSG=$(cut -d: -f3- <<< "$VALIDATION_RESULT")
                echo -e "${RED}❌ ERROR: Started the inference server but it is still not reachable${NC}" >&2
                echo -e "${RED} $ERROR_MSG${NC}" >&2
                echo -e "${YELLOW}💡 Check ${NEOCITIES_LOG_DIR:-$DIR/tmp}/llamacpp-server.log for the server's own diagnostics${NC}" >&2
                exit 1
            fi
        else
            echo -e "${RED}❌ ERROR: Failed to start the inference server${NC}" >&2
            echo -e "${YELLOW}💡 Run ./scripts/start-llamacpp-server.sh manually for verbose output${NC}" >&2
            echo -e "${YELLOW}💡 Use --list-servers to see available servers${NC}" >&2
            echo -e "${YELLOW}💡 Use --server=NAME to select a different server${NC}" >&2
            exit 1
        fi
    fi
fi
2143# }}}
2144
# Execute stages in pipeline order (regardless of argument order).
# Issue 10-051: each step goes through timed_stage <name> <function>, which
# records the step's wall-clock to .stage-timings on success (skipped stages and
# failures record nothing). The names in the middle column are the keys the
# pre-flight list reads back for its estimates.
#
# Row format: "FLAG_VARIABLE|timing-name|function". A step runs only when its
# flag variable currently holds "true"; several steps can share one flag.
_pipeline_steps=(
    "UPDATE_WORDS|update-words|run_update_words"
    "EXTRACT|extract|run_extract"
    # Issue 10-053: strip excluded content from input/ right after sync/extraction,
    # before anything catalogs or embeds it. Tied to extraction (which follows sync).
    "EXTRACT|strip-excluded|run_strip_excluded"
    "PARSE|parse|run_parse"
    "VALIDATE|validate|run_validate"
    "CATALOG_IMAGES|catalog-images|run_catalog_images"
    "GENERATE_EMBEDDINGS|generate-embeddings|run_generate_embeddings"
    # Semantic colors are part of embedding generation (Stage 6.5)
    # Only regenerate when embeddings are generated - HTML should use existing poem_colors.json
    "GENERATE_EMBEDDINGS|generate-semantic-colors|run_generate_semantic_colors"
    # Word embeddings run AFTER colors so the word-color step finds color_embeddings.json
    "GENERATE_EMBEDDINGS|generate-word-embeddings|run_generate_word_embeddings"
    # Issue 9-013: fold image pseudo-embeddings into the set BEFORE the similarity
    # matrix is built, so images rank alongside poems. Idempotent + cheap.
    "GENERATE_SIMILARITY|augment-images|run_augment_images"
    "GENERATE_SIMILARITY|generate-similarity|run_generate_similarity"
    "GENERATE_DIVERSITY|generate-diversity|run_generate_diversity"
    "GENERATE_HTML|generate-html|run_generate_html"
    "GENERATE_WORDCLOUD|wordcloud|run_generate_wordcloud"
)
for _pipeline_step in "${_pipeline_steps[@]}"; do
    IFS='|' read -r _step_flag _step_key _step_fn <<< "$_pipeline_step"
    # ${!_step_flag} expands to the flag's current value (true/false) and is run
    # as the condition command, exactly like "$FLAG && ..." would.
    if ${!_step_flag}; then
        timed_stage "$_step_key" "$_step_fn"
    fi
done
2170
# Final report, suppressed entirely when QUIET is true.
if ! $QUIET; then
    printf '\n'
    echo -e "$(symbol_success "") Pipeline completed successfully"

    # EXECUTED_COMMAND is non-empty only when a command preview was captured
    # from interactive mode; echo it back so the run can be repeated by copy-paste.
    if [[ -n "$EXECUTED_COMMAND" ]]; then
        printf '\n'
        echo -e "$(symbol_info "📋") Command executed:"
        printf '%s\n' " $EXECUTED_COMMAND"
    fi
fi
2182# }}}
2183