generate-embeddings.sh
1#!/bin/bash
2# Embedding Generation Manager for Neocities Poetry Modernization
3#
4# Generates vector embeddings for poems using inference server embedding models.
5# Supports incremental processing, cache management, and multiple models.
6#
7# Uses TUI library for vim-style interactive mode when available.
8#
9# Usage: ./generate-embeddings.sh [OPTIONS] [DIRECTORY]
10
11# {{{ TUI Library
# Location of the shared shell libraries, and whether the TUI could be loaded.
LIBS_DIR="/home/ritz/programming/ai-stuff/scripts/libs"
TUI_AVAILABLE=false

# The TUI is enabled only when the menu library file exists AND a luajit
# binary is on PATH; otherwise the script falls back to plain prompts.
[[ -f "${LIBS_DIR}/lua-menu.sh" ]] && command -v luajit &>/dev/null && {
    source "${LIBS_DIR}/lua-menu.sh"
    TUI_AVAILABLE=true
}
18# }}}
19
20# {{{ setup_dir_path
# Print the project directory to use.
# Arguments: $1 - optional directory; when empty or missing the built-in
#                 default project path is printed instead.
# Outputs:   the chosen directory on stdout.
setup_dir_path() {
    echo "${1:-/mnt/mtwo/programming/ai-stuff/neocities-modernization}"
}
28# }}}
29
# Parse command line options first to find the directory argument.
# The assignments below are the defaults; parse_args overrides them.
INCREMENTAL=true
FORCE_REGEN=false
SHOW_STATUS=false
VALIDATE_CACHE=false
FLUSH_ALL=false
FLUSH_ERRORS=false
BACKUP_BEFORE_FLUSH=true
FORCE_OPERATION=false
# 10-049 + Pascal-quant note: must match what config.lua's
# inference_servers[selected].model returns (no Ollama-style ":tag" colon,
# no GGUF ".gguf" extension). The /v1/models check later searches for this
# string in the server's loaded-model list, so it should be the family
# identifier ("nomic-embed-text-v1.5"), not the full file basename
# ("nomic-embed-text-v1.5.Q8_0.gguf"). When the config drifts from this
# constant that search reports "model not found" and the script bails.
MODEL_NAME="nomic-embed-text-v1.5"
LIST_MODELS=false
MODEL_STATUS=false
INTERACTIVE_MODE=false
DIRECTORY_ARG=""
ASSETS_DIR=""
# Issue 10-017: Inference server selection from config.lua
INFERENCE_SERVER=""

# Print the help text to stdout. Every option handled by parse_args is listed
# in its own section (including --server=, -I and --help, which the previous
# text either omitted or printed underneath the examples).
print_usage() {
    cat <<EOF
Usage: $0 [options] [directory]
Options:
  --incremental, --inc     Use incremental processing (default)
  --full-regen, --full     Force full regeneration of all embeddings
  --status                 Show cache status without processing
  --validate               Validate cache integrity
  -I                       Interactive mode - query user for options
  --help, -h               Show this help message

Cache Management Options:
  --flush-all              Remove all cached embeddings (complete regeneration)
  --flush-errors           Remove only error entries, keep valid embeddings
  --backup-before-flush    Create timestamped backup before flushing (default)
  --no-backup              Skip backup creation when flushing
  --force                  Skip confirmation prompts for automated scripts

Model Selection Options:
  --model=MODEL_NAME       Specify embedding model (default: nomic-embed-text-v1.5)
  --server=NAME            Select the inference server from config.lua
  --list-models            Show available models and their configurations
  --model-status           Show cache status for all models
  --dir=PATH               Use custom assets directory instead of default

Examples:
  $0 --flush-errors                      # Clean up failed entries
  $0 --flush-all                         # Start completely fresh
  $0 --model=text-embedding-ada-002      # Use OpenAI model
  $0 --list-models                       # Show available models
  $0 --model-status                      # Show cache status for all models
EOF
}

# Translate command line arguments into the flag variables above.
# Arguments: the script's "$@".
# Notes:     --help/-h prints the usage text and exits 0. The first argument
#            that does not start with "--" (and is not a known short flag)
#            becomes DIRECTORY_ARG; unrecognised "--" options are ignored.
parse_args() {
    local arg
    for arg in "$@"; do
        case "$arg" in
            --dir=*)
                ASSETS_DIR="${arg#*=}"
                ;;
            --full-regen|--full)
                INCREMENTAL=false
                FORCE_REGEN=true
                ;;
            --incremental|--inc)
                INCREMENTAL=true
                ;;
            --status)
                SHOW_STATUS=true
                ;;
            --validate)
                VALIDATE_CACHE=true
                ;;
            --flush-all)
                FLUSH_ALL=true
                ;;
            --flush-errors)
                FLUSH_ERRORS=true
                ;;
            --backup-before-flush)
                BACKUP_BEFORE_FLUSH=true
                ;;
            --no-backup)
                BACKUP_BEFORE_FLUSH=false
                ;;
            --force)
                FORCE_OPERATION=true
                ;;
            --model=*)
                MODEL_NAME="${arg#*=}"
                ;;
            # Issue 10-017: Inference server selection
            --server=*)
                INFERENCE_SERVER="${arg#*=}"
                ;;
            --list-models)
                LIST_MODELS=true
                ;;
            --model-status)
                MODEL_STATUS=true
                ;;
            -I)
                INTERACTIVE_MODE=true
                ;;
            --help|-h)
                print_usage
                exit 0
                ;;
            *)
                # If argument doesn't start with --, treat as directory
                # (only the first such argument is kept).
                if [[ $arg != --* ]] && [ -z "$DIRECTORY_ARG" ]; then
                    DIRECTORY_ARG="$arg"
                fi
                ;;
        esac
    done
}

parse_args "$@"
143
144# {{{ setup_embedding_tui_menu
# Build the TUI menu that exposes the embedding-generation options.
# Globals:  TUI_AVAILABLE (read)
# Returns:  0 once the menu has been populated; 1 when the TUI library was
#           not loaded or tui_init fails.
setup_embedding_tui_menu() {
    # Bail out early when the TUI cannot be used at all.
    $TUI_AVAILABLE || return 1
    tui_init || return 1

    menu_init
    menu_set_title "Embedding Manager" "neocities-modernization - j/k:nav space:toggle Enter:run"

    # --- Section 1: processing mode (single-selection section) -------------
    menu_add_section "mode" "single" "Processing Mode (select one)"
    menu_add_item "mode" "incremental" "Incremental" "checkbox" "1" \
        "Process only new/changed poems (fastest)" "1" ""
    menu_add_item "mode" "full_regen" "Full Regeneration" "checkbox" "0" \
        "Regenerate all embeddings from scratch" "2" ""
    menu_add_item "mode" "status_only" "Status Check" "checkbox" "0" \
        "Show current progress without processing" "3" ""

    # --- Section 2: cache management ----------------------------------------
    menu_add_section "cache" "multi" "Cache Management"
    menu_add_item "cache" "flush_all" "Flush All Embeddings ⚠️" "checkbox" "0" \
        "WARNING: Removes entire cache" "f" ""
    menu_add_item "cache" "flush_errors" "Flush Errors Only" "checkbox" "0" \
        "Remove failed entries, keep valid ones" "e" ""
    menu_add_item "cache" "validate" "Validate Cache" "checkbox" "0" \
        "Check integrity without changes" "v" ""

    # --- Section 3: cache options -------------------------------------------
    menu_add_section "cache_opts" "multi" "Cache Options"
    menu_add_item "cache_opts" "backup" "Backup Before Flush" "checkbox" "1" \
        "Create timestamped backup" "b" ""
    menu_add_item "cache_opts" "force" "Skip Confirmations" "checkbox" "0" \
        "Don't prompt for dangerous operations" "s" ""

    # --- Section 4: model selection -----------------------------------------
    # The multistate values are short names; apply_tui_selections maps them
    # to the identifiers stored in MODEL_NAME.
    menu_add_section "model" "multi" "Model Selection"
    menu_add_item "model" "model_name" "Embedding Model" "multistate" "nomic-embed-text" \
        "nomic-embed-text,qwen3-embedding,embeddinggemma,text-embedding-ada-002,all-MiniLM-L6-v2" "m" ""
    menu_add_item "model" "model_status" "Show Model Status" "checkbox" "0" \
        "Display cache stats for each model" "t" ""
    menu_add_item "model" "list_models" "List Available Models" "checkbox" "0" \
        "Show all configured models" "l" ""

    # --- Section 5: actions --------------------------------------------------
    menu_add_section "actions" "single" "Actions"
    menu_add_item "actions" "run" "Run" "action" "" \
        "Execute with selected options" "r"

    return 0
}
211# }}}
212
213# {{{ apply_tui_selections
# True when the TUI checkbox with id $1 currently holds the value "1".
_tui_checked() {
    [[ "$(menu_get_value "$1")" == "1" ]]
}

# Copy the values chosen in the TUI menu into the script's flag variables.
# Globals written: INCREMENTAL, FORCE_REGEN, SHOW_STATUS, FLUSH_ALL,
#                  FLUSH_ERRORS, VALIDATE_CACHE, BACKUP_BEFORE_FLUSH,
#                  FORCE_OPERATION, MODEL_NAME, MODEL_STATUS, LIST_MODELS
apply_tui_selections() {
    # Processing mode: the first checked entry wins; when none is checked the
    # current flag values are left untouched.
    if _tui_checked "incremental"; then
        INCREMENTAL=true
        FORCE_REGEN=false
        SHOW_STATUS=false
    elif _tui_checked "full_regen"; then
        INCREMENTAL=false
        FORCE_REGEN=true
        SHOW_STATUS=false
    elif _tui_checked "status_only"; then
        SHOW_STATUS=true
        INCREMENTAL=true
        FORCE_REGEN=false
    fi

    # Cache management: these can only switch a flag on, never off.
    if _tui_checked "flush_all"; then FLUSH_ALL=true; fi
    if _tui_checked "flush_errors"; then FLUSH_ERRORS=true; fi
    if _tui_checked "validate"; then VALIDATE_CACHE=true; fi

    # Cache options: the backup flag mirrors the checkbox in both directions.
    if _tui_checked "backup"; then
        BACKUP_BEFORE_FLUSH=true
    else
        BACKUP_BEFORE_FLUSH=false
    fi
    if _tui_checked "force"; then FORCE_OPERATION=true; fi

    # Model selection. 10-049: the menu offers short names, which are mapped
    # here to the identifiers used for MODEL_NAME. Per the original note the
    # embedding cache directory is derived from this string (sanitize-then-
    # mkdir in utils.embeddings_dir()), so changing a mapping starts a fresh
    # cache. Anything unrecognised (including "nomic-embed-text") resolves to
    # the default model.
    local selected_model
    selected_model=$(menu_get_value "model_name")
    case "$selected_model" in
        qwen3-embedding)
            MODEL_NAME="qwen3-embedding-4b"
            ;;
        embeddinggemma|text-embedding-ada-002|all-MiniLM-L6-v2)
            MODEL_NAME="$selected_model"
            ;;
        *)
            MODEL_NAME="nomic-embed-text-v1.5"
            ;;
    esac

    if _tui_checked "model_status"; then MODEL_STATUS=true; fi
    if _tui_checked "list_models"; then LIST_MODELS=true; fi

    # Validation: flush_all takes precedence over flush_errors.
    if [[ "$FLUSH_ALL" == "true" && "$FLUSH_ERRORS" == "true" ]]; then
        FLUSH_ERRORS=false
    fi
}
263# }}}
264
265# {{{ run_tui_interactive_mode
# Run the TUI-based interactive mode.
# Returns: 1 when the menu could not be set up (the caller then falls back to
#          the plain prompts); 0 after the selections have been applied.
#          Exits the script with status 0 when the user cancels the menu.
run_tui_interactive_mode() {
    if ! setup_embedding_tui_menu; then
        return 1
    fi

    if ! menu_run; then
        menu_cleanup
        echo "Operation cancelled."
        exit 0
    fi

    menu_cleanup
    apply_tui_selections

    # Summarise the choice. Each label is computed with a plain if/elif:
    # the previous "A && x || B && y || C && z || none" chain is evaluated
    # left to right, so a true FLUSH_ALL printed all three labels.
    local mode_label="Incremental"
    if [ "$FORCE_REGEN" = true ]; then
        mode_label="Full regeneration"
    fi

    local status_label="No"
    if [ "$SHOW_STATUS" = true ]; then
        status_label="Yes"
    fi

    local cache_label="None"
    if [ "$FLUSH_ALL" = true ]; then
        cache_label="Flush all"
    elif [ "$FLUSH_ERRORS" = true ]; then
        cache_label="Flush errors"
    elif [ "$VALIDATE_CACHE" = true ]; then
        cache_label="Validate"
    fi

    echo ""
    echo "Selected configuration:"
    echo "- Model: $MODEL_NAME"
    echo "- Mode: $mode_label"
    echo "- Status check: $status_label"
    echo "- Cache operations: $cache_label"
    echo ""
    return 0
}
291# }}}
292
293# {{{ run_simple_interactive_mode
# Fallback interactive mode: plain numbered prompts read from stdin.
# Globals written: INCREMENTAL, FORCE_REGEN, FLUSH_ALL, FLUSH_ERRORS,
#                  VALIDATE_CACHE, SHOW_STATUS, MODEL_NAME
# Exits the script with status 0 when the user declines the confirmation.
run_simple_interactive_mode() {
    local mode_choice cache_choice model_choice confirm

    echo "=== Embedding Generation Interactive Mode ==="
    echo ""
    echo "Select processing mode:"
    echo "1. Incremental (default) - Process only new/changed poems"
    echo "2. Full regeneration - Regenerate all embeddings"
    echo "3. Cache management - Flush/validate cache"
    echo "4. Status check - Show current progress"
    echo ""
    read -r -p "Choose option (1-4): " mode_choice

    case "$mode_choice" in
        1)
            INCREMENTAL=true
            ;;
        2)
            INCREMENTAL=false
            FORCE_REGEN=true
            ;;
        3)
            echo ""
            echo "Cache management options:"
            echo "1. Flush all cached embeddings"
            echo "2. Flush only failed embedding attempts"
            echo "3. Validate cache integrity"
            read -r -p "Choose cache option (1-3): " cache_choice
            case "$cache_choice" in
                1) FLUSH_ALL=true ;;
                2) FLUSH_ERRORS=true ;;
                3) VALIDATE_CACHE=true ;;
            esac
            ;;
        4)
            SHOW_STATUS=true
            ;;
    esac

    # Model identifiers follow the same convention as the MODEL_NAME default
    # and the TUI mapping: family identifiers without an Ollama-style ":tag"
    # suffix, because the name is later searched for in the server's
    # /v1/models listing.
    echo ""
    echo "Available embedding models:"
    echo "1. nomic-embed-text-v1.5 (default)"
    echo "2. qwen3-embedding-4b"
    echo "3. embeddinggemma"
    echo "4. text-embedding-ada-002"
    echo "5. all-MiniLM-L6-v2"
    read -r -p "Choose model (1-5, or press enter for default): " model_choice

    case "$model_choice" in
        2) MODEL_NAME="qwen3-embedding-4b" ;;
        3) MODEL_NAME="embeddinggemma" ;;
        4) MODEL_NAME="text-embedding-ada-002" ;;
        5) MODEL_NAME="all-MiniLM-L6-v2" ;;
        *) MODEL_NAME="nomic-embed-text-v1.5" ;;
    esac

    # Summary labels are computed with plain if/elif: the previous
    # "A && x || B && y || C && z || none" chain is evaluated left to right,
    # so a true FLUSH_ALL printed all three cache labels.
    local mode_label="Full regeneration"
    if [ "$INCREMENTAL" = true ]; then
        mode_label="Incremental"
    fi

    local status_label="No"
    if [ "$SHOW_STATUS" = true ]; then
        status_label="Yes"
    fi

    local cache_label="None"
    if [ "$FLUSH_ALL" = true ]; then
        cache_label="Flush all"
    elif [ "$FLUSH_ERRORS" = true ]; then
        cache_label="Flush errors"
    elif [ "$VALIDATE_CACHE" = true ]; then
        cache_label="Validate"
    fi

    echo ""
    echo "Selected configuration:"
    echo "- Model: $MODEL_NAME"
    echo "- Mode: $mode_label"
    echo "- Status check: $status_label"
    echo "- Cache operations: $cache_label"
    echo ""
    read -r -p "Continue with this configuration? (y/n): " confirm
    if [[ ! "$confirm" =~ ^[Yy] ]]; then
        echo "Operation cancelled."
        exit 0
    fi
    echo ""
}
363# }}}
364
# Interactive mode (-I): prefer the TUI; use the plain prompts when the TUI
# library is unavailable or the TUI run reports failure.
if [ "$INTERACTIVE_MODE" = true ]; then
    { $TUI_AVAILABLE && run_tui_interactive_mode; } || run_simple_interactive_mode
fi
373
# Resolve the project directory (positional argument, else the built-in
# default) and make it the working directory; abort when that fails.
DIR=$(setup_dir_path "$DIRECTORY_ARG")
if ! cd "$DIR"; then
    exit 1
fi

# Issue 8-059: the progress file shared with similarity-engine.lua lives under
# tmp/, so make sure the tmpfs-backed tmp/ symlink exists before any write.
"${DIR}/scripts/ensure-tmp-symlink" "${DIR}"

# --dir argument for Lua scripts; stays empty unless --dir=PATH was given.
ASSETS_ARG="${ASSETS_DIR:+--dir $ASSETS_DIR}"

# ANSI colour escapes used by every `echo -e` below (NC resets the colour).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'

# Progress tracking: start timestamp plus the input and (legacy) output paths.
START_TIME=$(date +%s)
POEMS_FILE="$DIR/assets/poems.json"
EMBEDDINGS_FILE="$DIR/assets/embeddings.json"

# Log location. Per the original note, run.sh's --debug exports
# NEOCITIES_LOG_DIR (durable disk, output/debug-logs) so the log survives the
# reboot a hard GPU lock forces; without it the log goes to the RAM-backed
# tmp/. The end-of-run cleanup is skipped when the variable is set.
EMBED_LOG_DIR="${NEOCITIES_LOG_DIR:-${DIR}/tmp}"
mkdir -p "$EMBED_LOG_DIR"
TEMP_LOG="${EMBED_LOG_DIR}/embedding_generation.log"

# Opening banner.
echo -e "${CYAN}================================================================${NC}"
echo -e "${CYAN} POEM EMBEDDING GENERATION - LIVE PROGRESS MONITOR${NC}"
echo -e "${CYAN}================================================================${NC}"
echo ""
# Informational model operations: each one calls into similarity-engine and
# then ends the script. --list-models is checked first, so it wins when both
# flags are set.
case "${LIST_MODELS}:${MODEL_STATUS}" in
    true:*)
        lua -e "
            package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
            local engine = require('similarity-engine')
            engine.list_available_models()
        "
        exit 0
        ;;
    *:true)
        lua -e "
            package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
            local engine = require('similarity-engine')
            engine.show_all_model_status('$DIR/assets')
        "
        exit 0
        ;;
esac
430
# Model-specific paths.
# Issue 10-054: the cache directory is resolved through scripts/cache-dir so
# this shell writer and the Lua readers agree on the location (disk or RAM,
# per the CACHE_IN_RAM switch). An empty answer is a hard error rather than a
# silent write to a wrong or empty path.
SAFE_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/[^a-zA-Z0-9._-]/_/g')
EMBEDDINGS_DIR=$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME")
if [ -z "$EMBEDDINGS_DIR" ]; then
    echo "Error: could not resolve the embeddings cache dir (scripts/cache-dir)" >&2
    exit 1
fi
EMBEDDINGS_FILE="$EMBEDDINGS_DIR/embeddings.json"

# Make sure the per-model directory exists before anything writes into it.
mkdir -p "$EMBEDDINGS_DIR"

# Run summary.
if [ "$INCREMENTAL" = true ]; then
    PROCESSING_MODE_LABEL="Incremental (default)"
else
    PROCESSING_MODE_LABEL="Full Regeneration"
fi
echo -e "${BLUE}Project Directory:${NC} $DIR"
echo -e "${BLUE}Input File:${NC} $POEMS_FILE"
echo -e "${BLUE}Model:${NC} $MODEL_NAME"
echo -e "${BLUE}Output File:${NC} $EMBEDDINGS_FILE"
echo -e "${BLUE}Processing Mode:${NC} $PROCESSING_MODE_LABEL"
echo -e "${BLUE}Start Time:${NC} $(date)"
echo ""
454
# Cache flush (--flush-all / --flush-errors). Every path through this section
# ends the script: nothing to flush, user declined, flush failed, flush done.
if [ "$FLUSH_ALL" = true ] || [ "$FLUSH_ERRORS" = true ]; then
    echo -e "${YELLOW}🗑️ Cache Flush Operation${NC}"
    echo ""

    # When both flags are set on the command line, "errors" is the type used.
    if [ "$FLUSH_ERRORS" = true ]; then
        FLUSH_TYPE="errors"
    else
        FLUSH_TYPE="all"
    fi

    echo -e "${BLUE}Flush Type:${NC} $FLUSH_TYPE"
    echo -e "${BLUE}Target File:${NC} $EMBEDDINGS_FILE"
    echo -e "${BLUE}Backup Enabled:${NC} $BACKUP_BEFORE_FLUSH"

    # Nothing to do without a cache file.
    if [ ! -f "$EMBEDDINGS_FILE" ]; then
        echo -e "${YELLOW}No cache file found${NC}"
        exit 0
    fi
    FILE_SIZE=$(du -h "$EMBEDDINGS_FILE" | cut -f1)
    echo -e "${BLUE}Current File Size:${NC} $FILE_SIZE"
    echo ""

    # Safety confirmation, skipped with --force. Only the literal answer
    # "yes" proceeds.
    if [ "$FORCE_OPERATION" != true ]; then
        echo -e "${YELLOW}⚠️ WARNING: This will permanently modify the embedding cache${NC}"
        if [ "$FLUSH_TYPE" = "all" ]; then
            echo -e "${RED}This will DELETE ALL cached embeddings!${NC}"
        else
            echo -e "${YELLOW}This will remove error entries but keep valid embeddings${NC}"
        fi
        echo ""
        read -p "Are you sure you want to proceed? (yes/no): " confirmation
        if [ "$confirmation" != "yes" ]; then
            echo "Operation cancelled"
            exit 0
        fi
    fi

    echo -e "${CYAN}Executing flush operation...${NC}"

    # Lua boolean literal for the backup argument: on unless explicitly
    # disabled.
    if [ "$BACKUP_BEFORE_FLUSH" = false ]; then
        BACKUP_LUA_FLAG="false"
    else
        BACKUP_LUA_FLAG="true"
    fi

    # The relative package.path entries resolve against the project
    # directory, which is the current working directory at this point.
    if lua -e "
        package.path = package.path .. ';./libs/?.lua;./src/?.lua'
        local similarity_engine = require('similarity-engine')
        local success = similarity_engine.flush_embeddings_cache('$EMBEDDINGS_FILE', '$FLUSH_TYPE', $BACKUP_LUA_FLAG)
        if not success then
            os.exit(1)
        end
    "; then
        echo ""
        echo -e "${GREEN}✅ Cache flush operation completed successfully${NC}"
    else
        echo ""
        echo -e "${RED}❌ Cache flush operation failed${NC}"
        exit 1
    fi

    echo ""
    echo -e "${CYAN}================================================================${NC}"
    echo -e "${CYAN} CACHE FLUSH COMPLETE${NC}"
    echo -e "${CYAN}================================================================${NC}"
    exit 0
fi
527
# Status (--status) and validation (--validate) reporting.
# --status ends the script after the report; --validate alone reports and
# then lets the script continue into generation.
if [ "$SHOW_STATUS" = true ] || [ "$VALIDATE_CACHE" = true ]; then
    echo -e "${YELLOW}🔍 Checking embedding cache status...${NC}"

    if [ ! -f "$EMBEDDINGS_FILE" ]; then
        echo -e "${RED}❌ No embedding cache found${NC}"
        echo -e "${YELLOW}💡 Run without --status to generate embeddings${NC}"
    else
        # One Lua pass over the cache prints a single CSV line:
        #   total,completed,rate,mode,generated,model
        # Any Lua failure is replaced by a placeholder line.
        CACHE_INFO=$(lua -e "
            local dkjson = require('libs.dkjson')
            local f = io.open('$EMBEDDINGS_FILE')
            local data = dkjson.decode(f:read('*a'))
            f:close()

            -- Count actual embeddings in file
            local total_entries = 0
            local completed_embeddings = 0
            for id, emb in pairs(data.embeddings or {}) do
                total_entries = total_entries + 1
                if emb.embedding then
                    completed_embeddings = completed_embeddings + 1
                end
            end

            local rate = total_entries > 0 and (completed_embeddings / total_entries) or 0
            local mode = data.metadata and data.metadata.processing_mode or 'unknown'
            local generated = data.metadata and data.metadata.generated_at or 'unknown'
            local model = data.metadata and data.metadata.embedding_model or 'qwen3-embedding:4b'

            print(string.format('%d,%d,%.3f,%s,%s,%s', total_entries, completed_embeddings, rate, mode, generated, model))
        " 2>/dev/null || echo "0,0,0,error,unknown,unknown")

        IFS=',' read -r CACHE_TOTAL CACHE_COMPLETED CACHE_RATE CACHE_MODE CACHE_DATE CACHE_MODEL <<< "$CACHE_INFO"

        echo -e "${GREEN}✓ Embedding cache found${NC}"
        echo -e "${BLUE}Cache Statistics:${NC}"
        echo -e " Total poems: ${YELLOW}$CACHE_TOTAL${NC}"
        echo -e " Completed embeddings: ${GREEN}$CACHE_COMPLETED${NC}"
        echo -e " Completion rate: ${GREEN}$(printf "%.1f%%" $(echo "$CACHE_RATE * 100" | bc -l))${NC}"
        echo -e " Processing mode: ${PURPLE}$CACHE_MODE${NC}"
        echo -e " Generated: ${CYAN}$CACHE_DATE${NC}"
        echo -e " Model: ${PURPLE}$CACHE_MODEL${NC}"

        if [ "$VALIDATE_CACHE" = true ]; then
            echo ""
            echo -e "${YELLOW}🔍 Validating cache integrity...${NC}"
            # NOTE(review): no integrity check is performed yet; this is a
            # placeholder that only prints the messages.
            echo -e "${GREEN}✓ Cache validation complete${NC}"
        fi
    fi

    if [ "$SHOW_STATUS" = true ]; then
        exit 0
    fi
fi
583
# Check prerequisites: the poems file, a reachable inference server, and the
# selected model being listed by that server. Any failure exits with status 1.
echo -e "${YELLOW}🔍 Checking prerequisites...${NC}"

if [ ! -f "$POEMS_FILE" ]; then
    echo -e "${RED}❌ ERROR: Poems file not found at $POEMS_FILE${NC}"
    exit 1
fi

# Count total poems
TOTAL_POEMS=$(lua -e "local dkjson = require('libs.dkjson'); local f = io.open('$POEMS_FILE'); local data = dkjson.decode(f:read('*a')); f:close(); print(#data.poems)")
echo -e "${GREEN}✓ Found $TOTAL_POEMS poems to process${NC}"

# Resolve the inference server endpoint. 10-049: was Ollama; now llama.cpp.
if [ -n "$INFERENCE_SERVER" ]; then
    # Issue 10-017: explicit --server=NAME selection from config.lua.
    INFERENCE_ENDPOINT=$(luajit -e "
        package.path = '$DIR/libs/?.lua;' .. package.path
        local inference = require('inference-server-config')
        inference.set_selected_server('$INFERENCE_SERVER')
        print(inference.build_host_url())
    ")
    echo -e "${CYAN}Using Inference server: $INFERENCE_SERVER${NC}"
else
    # Default: use config default or hardcoded fallback
    INFERENCE_ENDPOINT=$(luajit -e "
        package.path = '$DIR/libs/?.lua;' .. package.path
        local inference = require('inference-server-config')
        print(inference.build_host_url())
    " 2>/dev/null || echo "http://127.0.0.1:18080")
fi

# /v1/models is llama.cpp's OpenAI-compatible "what's loaded" endpoint.
# It is fetched ONCE (with a timeout) and the response is reused as liveness
# probe, as the source for the model check, and for the listing on failure.
if MODELS_JSON=$(curl -s --max-time 3 "$INFERENCE_ENDPOINT/v1/models"); then
    echo -e "${GREEN}✓ Inference server reachable at $INFERENCE_ENDPOINT${NC}"
else
    echo -e "${RED}❌ ERROR: Cannot connect to inference server at $INFERENCE_ENDPOINT${NC}"
    exit 1
fi

# Check selected embedding model. /v1/models returns {data: [{id: "..."},...]}.
# -F makes this a literal substring search: as a regex, the "." in a name such
# as "v1.5" would match any character. "--" keeps a name that starts with "-"
# from being parsed as a grep option.
if grep -qF -- "$MODEL_NAME" <<< "$MODELS_JSON"; then
    echo -e "${GREEN}✓ $MODEL_NAME model available${NC}"
else
    echo -e "${RED}❌ ERROR: $MODEL_NAME model not found${NC}"
    echo -e "${YELLOW}💡 Loaded models on this server:${NC}"
    lua -e "
        local dkjson = require('libs.dkjson')
        local data = dkjson.decode(io.read('*a'))
        if data and data.data then
            for _, model in ipairs(data.data) do
                print(' ' .. (model.id or '(unnamed)'))
            end
        end
    " <<< "$MODELS_JSON" 2>/dev/null
    exit 1
fi

# Announce which kind of run is about to start.
echo ""
if [ "$INCREMENTAL" = true ]; then
    echo -e "${CYAN}🚀 Starting incremental embedding generation for $TOTAL_POEMS poems...${NC}"
    echo -e "${YELLOW}💡 Only new/changed poems will be processed (time savings expected)${NC}"
else
    echo -e "${CYAN}🚀 Starting FULL regeneration of embeddings for $TOTAL_POEMS poems...${NC}"
    echo -e "${YELLOW}⚠️ All embeddings will be regenerated (this may take longer)${NC}"
fi
echo ""
649
# Signal handler for SIGINT/SIGTERM: stop the background jobs, report how far
# the run got, remove the progress file and exit with status 0.
# Globals read: EMBED_PID, MONITOR_PID, EMBEDDINGS_FILE, TOTAL_POEMS,
#               START_TIME, DIR, USER and the colour variables.
cleanup_and_exit() {
    echo ""
    echo -e "${YELLOW}🛑 Termination signal received${NC}"
    echo -e "${CYAN}Performing graceful cleanup...${NC}"

    # Stop the embedding worker first, then the progress monitor. Each PID
    # may still be unset if the signal arrives before the job was started.
    if [[ -n "$EMBED_PID" ]]; then
        echo -e "${BLUE}Stopping embedding generation process...${NC}"
        kill -TERM "$EMBED_PID" 2>/dev/null
        wait "$EMBED_PID" 2>/dev/null
    fi
    if [[ -n "$MONITOR_PID" ]]; then
        echo -e "${BLUE}Stopping progress monitor...${NC}"
        kill -TERM "$MONITOR_PID" 2>/dev/null
        wait "$MONITOR_PID" 2>/dev/null
    fi

    # Report how many embeddings are already in the cache file; any Lua
    # failure is reported as 0.
    if [[ -f "$EMBEDDINGS_FILE" ]]; then
        local saved_count
        saved_count=$(lua -e "
            local dkjson = require('libs.dkjson')
            local f = io.open('$EMBEDDINGS_FILE')
            if not f then print(0); return end
            local content = f:read('*a')
            f:close()
            if content == '' then print(0); return end
            local data = dkjson.decode(content)
            if not data or not data.embeddings then print(0); return end
            local count = 0
            -- Handle both array and object format
            if data.embeddings[1] then
                -- Array format
                for _, emb in ipairs(data.embeddings) do
                    if emb.embedding then count = count + 1 end
                end
            else
                -- Object format
                for id, emb in pairs(data.embeddings) do
                    if emb.embedding then count = count + 1 end
                end
            end
            print(count)
        " 2>/dev/null || echo "0")

        echo -e "${GREEN}✅ Embeddings saved to cache${NC}"
        echo -e "${BLUE}Progress preserved: $saved_count/$TOTAL_POEMS embeddings completed${NC}"
        echo -e "${CYAN}Use incremental mode to resume from current position${NC}"
    fi

    # Whole minutes elapsed since the script started.
    local elapsed_minutes=$(( ($(date +%s) - START_TIME) / 60 ))
    echo -e "${BLUE}Total runtime: ${elapsed_minutes}m${NC}"

    # Remove the progress file shared with the monitor.
    rm -f "${DIR}/tmp/embedding_progress_${USER}.txt" 2>/dev/null

    exit 0
}

# Register signal handlers
trap cleanup_and_exit SIGINT SIGTERM
714
# Progress monitor, run as a background job next to the embedding process.
# Issue 10-022: liveness is detected through the PID (kill -0) instead of
# pgrep on the process name, because the worker is started with `luajit -e`.
#
# Globals read: DIR, USER, EMBED_PID (must be set before this is started),
#               TOTAL_POEMS, INFERENCE_ENDPOINT, PURPLE, RED, YELLOW, NC.
# Input:        ${DIR}/tmp/embedding_progress_${USER}.txt, expected to hold
#               "current,total" as written by the Lua side.
# Returns when the process with PID EMBED_PID no longer exists.
monitor_progress() {
    local progress_file="${DIR}/tmp/embedding_progress_${USER}.txt"
    local target_pid=$EMBED_PID
    local current_poem=0
    local total_poems=0
    local percent=0
    local last_progress_time=0
    local last_reported_percent=-1
    local bar_length=50
    local health_check_interval=300 # 5 minutes
    local last_health_check current_time
    local file_mtime progress_data new_current new_total
    local filled bar i

    # Detect whether stdout is an interactive terminal. When it is a pipe
    # (e.g. run.sh --debug routes output through a line-based logger) a
    # carriage-return bar never produces a newline, so a newline-terminated
    # progress line is emitted on each percent change instead.
    local is_tty=0
    if [ -t 1 ]; then is_tty=1; fi

    last_health_check=$(date +%s)

    while true; do
        if [ -f "$progress_file" ]; then
            file_mtime=$(stat -c %Y "$progress_file" 2>/dev/null || echo "0")
            if [ "$file_mtime" -gt "$last_progress_time" ]; then
                progress_data=$(cat "$progress_file" 2>/dev/null || echo "0,0")
                IFS=',' read -r new_current new_total _ <<< "$progress_data"

                # Accept the sample only when both fields are plain integers.
                # A file caught mid-write (empty or truncated) is ignored and
                # re-read on the next pass because last_progress_time is left
                # unchanged; the previous numbers stay on screen meanwhile.
                if [[ "$new_current" =~ ^[0-9]+$ && "$new_total" =~ ^[0-9]+$ ]]; then
                    # 10# forces base 10 so a leading zero is not read as octal.
                    current_poem=$((10#$new_current))
                    total_poems=$((10#$new_total))
                    last_progress_time=$file_mtime

                    # Guard against division by zero.
                    if (( total_poems > 0 )); then
                        percent=$((current_poem * 100 / total_poems))
                    else
                        percent=0
                    fi
                fi
            fi
        else
            # No progress file (yet): show an empty bar.
            current_poem=0
            percent=0
        fi

        if [ "$is_tty" -eq 1 ]; then
            # Interactive terminal: redraw the bar in place.
            filled=$((percent * bar_length / 100))
            bar=""
            for ((i = 0; i < filled; i++)); do bar="${bar}█"; done
            for ((i = filled; i < bar_length; i++)); do bar="${bar}░"; done
            echo -ne "\033[2K\r${PURPLE}Progress: ${bar} ${percent}% (${current_poem}/${TOTAL_POEMS})${NC}"
        elif [ "$percent" -ne "$last_reported_percent" ]; then
            # Captured output: one line per percentage change.
            echo "Progress: ${percent}% (${current_poem}/${TOTAL_POEMS})"
            last_reported_percent=$percent
        fi

        # Stop once the embedding process is gone.
        if ! kill -0 "$target_pid" 2>/dev/null; then
            break
        fi

        # Health check of the inference server every 5 minutes, measured
        # from the previous check. Testing `now % 300 == 0` instead would
        # fire on every loop pass within the matching second (several curl
        # calls and repeated warnings) and be skipped whenever the loop
        # happens to miss that second.
        current_time=$(date +%s)
        if (( current_time - last_health_check >= health_check_interval )); then
            last_health_check=$current_time
            if ! curl -s --max-time 3 "$INFERENCE_ENDPOINT/v1/models" > /dev/null; then
                echo ""
                echo ""
                echo -e "${RED}⚠️ INFERENCE SERVER UNAVAILABLE${NC}"
                echo -e "${YELLOW}Embedding process may fail and could corrupt the cache.${NC}"
                echo -e "${YELLOW}Consider stopping the process and restarting the inference server.${NC}"
            fi
        fi

        sleep 0.2
    done
}
799
# Launch the embedding generation in the background.
# Issue 10-022: the Lua program is passed with -e instead of piped stdin,
# because piped stdin caused curl exit code 7 (connection refused) due to
# file descriptor issues.
echo "Generating embeddings..." > "$TEMP_LOG"

# Incremental and full regeneration differ only in one boolean argument, so a
# single Lua program is built and the flag is passed in.
if [ "$INCREMENTAL" = true ]; then
    INCREMENTAL_LUA=true
else
    INCREMENTAL_LUA=false
fi

# Debug mode (NEOCITIES_LOG_DIR set by run.sh --debug): make Lua's stdout and
# stderr unbuffered so log lines are not held back in a block buffer and lost
# on a hard lock. Outside debug the prologue is empty and Lua keeps its
# default buffering, which is faster for normal runs.
if [ -n "${NEOCITIES_LOG_DIR:-}" ]; then
    LUA_DEBUG_PROLOGUE="io.stdout:setvbuf('no'); io.stderr:setvbuf('no');"
else
    LUA_DEBUG_PROLOGUE=""
fi

LUA_EMBED_PROGRAM="
    ${LUA_DEBUG_PROLOGUE}
    package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
    local sim = require('similarity-engine')
    local success = sim.generate_all_embeddings(
        '$POEMS_FILE',
        '$DIR/assets',
        '$INFERENCE_ENDPOINT',
        ${INCREMENTAL_LUA},
        '$MODEL_NAME'
    )
    os.exit(success and 0 or 1)
"

# Normal runs append straight to the log. In debug the output goes through
# fsync-logger so each line is committed to disk as it is written. The logger
# runs in a process substitution, so $! is still the luajit PID waited on
# below.
if [ -z "${NEOCITIES_LOG_DIR:-}" ]; then
    luajit -e "$LUA_EMBED_PROGRAM" >> "$TEMP_LOG" 2>&1 &
else
    luajit -e "$LUA_EMBED_PROGRAM" \
        > >("${DIR}/scripts/fsync-logger" --quiet "$TEMP_LOG") 2>&1 &
fi
EMBED_PID=$!

# Start progress monitoring alongside the worker.
monitor_progress &
MONITOR_PID=$!

# Block until the worker finishes and keep its exit status.
wait "$EMBED_PID"
EMBED_RESULT=$?

# The monitor normally ends on its own once the worker is gone; kill it
# anyway and reap it so nothing is left behind.
kill "$MONITOR_PID" 2>/dev/null
wait "$MONITOR_PID" 2>/dev/null

echo ""
echo ""

# Elapsed time for the completion report.
# NOTE(review): SECONDS is a Bash special variable. Assigning to it resets
# the shell's timer, so later reads return this value plus the seconds
# elapsed since the assignment. The name is kept because code outside this
# section may read it; consider renaming it there.
END_TIME=$(date +%s)
TOTAL_TIME=$((END_TIME - START_TIME))
MINUTES=$((TOTAL_TIME / 60))
SECONDS=$((TOTAL_TIME % 60))
866
867echo -e "${CYAN}================================================================${NC}"
868echo -e "${CYAN} EMBEDDING GENERATION COMPLETE${NC}"
869echo -e "${CYAN}================================================================${NC}"
870echo ""
871
872if [ $EMBED_RESULT -eq 0 ] && [ -f "$EMBEDDINGS_FILE" ]; then
873 # Generate detailed statistics
874 STATS=$(lua -e "
875 local dkjson = require('libs.dkjson')
876 local f = io.open('$EMBEDDINGS_FILE')
877 local data = dkjson.decode(f:read('*a'))
878 f:close()
879
880 local total = 0
881 local successful = 0
882 local failed = 0
883 local empty_content = 0
884 local avg_length = 0
885 local total_length = 0
886 local new_embeddings = data.metadata.new_embeddings or 0
887 local reused_embeddings = data.metadata.reused_embeddings or 0
888 local processing_mode = data.metadata.processing_mode or 'unknown'
889
890 for id, emb in pairs(data.embeddings) do
891 total = total + 1
892 if emb.embedding then
893 successful = successful + 1
894 if emb.content_length then
895 total_length = total_length + emb.content_length
896 end
897 elseif emb.error == 'empty_content' then
898 empty_content = empty_content + 1
899 else
900 failed = failed + 1
901 end
902 end
903
904 if successful > 0 then
905 avg_length = math.floor(total_length / successful)
906 end
907
908 -- Guard against division by zero when embeddings array is empty
909 local success_rate = 0
910 if total > 0 then
911 success_rate = math.floor((successful / total) * 100)
912 end
913 local processing_rate = 0
914 if $TOTAL_TIME > 0 then
915 processing_rate = math.floor(successful * 3600 / $TOTAL_TIME)
916 end
917 local time_savings = 0
918 if total > 0 then
919 time_savings = math.floor((reused_embeddings / total) * 100)
920 end
921
922 print(string.format('%d,%d,%d,%d,%d,%d,%d,%d,%d,%s', total, successful, failed, empty_content, success_rate, avg_length, processing_rate, new_embeddings, time_savings, processing_mode))
923 ")
924
925 IFS=',' read -r TOTAL_PROCESSED SUCCESSFUL FAILED EMPTY_CONTENT SUCCESS_RATE AVG_LENGTH PROCESSING_RATE NEW_EMBEDDINGS TIME_SAVINGS PROCESSING_MODE <<< "$STATS"
926
927 # Check if generation actually produced embeddings
928 # terminated_network_error mode with 0 successful is a failure
929 if [ "$SUCCESSFUL" -eq 0 ] || [ "$PROCESSING_MODE" = "terminated_network_error" ]; then
930 echo -e "${RED}❌ GENERATION FAILED${NC}"
931 echo ""
932 echo -e "${YELLOW}The embedding generation terminated without completing:${NC}"
933 echo -e " Processing Mode: ${RED}$PROCESSING_MODE${NC}"
934 echo -e " Successful Embeddings: ${RED}$SUCCESSFUL${NC}"
935 echo ""
936 echo -e "${YELLOW}💡 Troubleshooting:${NC}"
937 echo -e " 1. Check inference server: curl $INFERENCE_ENDPOINT/v1/models"
938 echo -e " 2. Test embedding API: curl $INFERENCE_ENDPOINT/v1/embeddings -d '{\"model\":\"$MODEL_NAME\",\"input\":\"test\"}'"
939 echo -e " 3. Retry: ./run.sh --generate-embeddings --force"
940 exit 1
941 fi
942
943 echo -e "${GREEN}✅ GENERATION SUCCESSFUL${NC}"
944 echo ""
945 echo -e "${BLUE}📊 Processing Statistics:${NC}"
946 echo -e " Processing Mode: ${PURPLE}$PROCESSING_MODE${NC}"
947 echo -e " Total Poems Processed: ${YELLOW}$TOTAL_PROCESSED${NC}"
948 echo -e " Successful Embeddings: ${GREEN}$SUCCESSFUL${NC}"
949 if [ "$PROCESSING_MODE" = "incremental" ]; then
950 REUSED_EMBEDDINGS=$((SUCCESSFUL - NEW_EMBEDDINGS))
951 echo -e " New Embeddings Generated: ${CYAN}$NEW_EMBEDDINGS${NC}"
952 echo -e " Existing Embeddings Reused: ${GREEN}$REUSED_EMBEDDINGS${NC}"
953 echo -e " Time Savings: ${GREEN}$TIME_SAVINGS%${NC}"
954 fi
955 echo -e " Failed Embeddings: ${RED}$FAILED${NC}"
956 echo -e " Empty Content Skipped: ${YELLOW}$EMPTY_CONTENT${NC}"
957 echo -e " Success Rate: ${GREEN}$SUCCESS_RATE%${NC}"
958 echo ""
959 echo -e "${BLUE}📈 Performance Metrics:${NC}"
960 echo -e " Total Processing Time: ${YELLOW}${MINUTES}m ${SECONDS}s${NC}"
961 if [ "$PROCESSING_MODE" = "incremental" ] && [ "$NEW_EMBEDDINGS" -gt 0 ]; then
962 ACTUAL_PROCESSING_RATE=$((NEW_EMBEDDINGS * 3600 / TOTAL_TIME))
963 echo -e " New Embedding Rate: ${GREEN}$ACTUAL_PROCESSING_RATE embeddings/hour${NC}"
964 echo -e " Overall Effective Rate: ${GREEN}$PROCESSING_RATE embeddings/hour${NC}"
965 else
966 echo -e " Average Processing Rate: ${GREEN}$PROCESSING_RATE embeddings/hour${NC}"
967 fi
968 echo -e " Average Poem Length: ${CYAN}$AVG_LENGTH characters${NC}"
969 echo ""
970 echo -e "${BLUE}🎯 Technical Details:${NC}"
971 echo -e " Embedding Model: ${PURPLE}${MODEL_NAME}${NC}"
972 echo -e " Vector Dimensions: ${PURPLE}${EMBEDDING_DIM:-unknown}${NC}"
973 echo -e " CUDA Acceleration: ${GREEN}Enabled${NC}"
974 echo -e " Endpoint: ${CYAN}$INFERENCE_ENDPOINT${NC}"
975 echo ""
976
977 # File size information
978 if [ -f "$EMBEDDINGS_FILE" ]; then
979 FILE_SIZE=$(du -h "$EMBEDDINGS_FILE" | cut -f1)
980 echo -e "${BLUE}📁 Output File:${NC}"
981 echo -e " Location: ${CYAN}$EMBEDDINGS_FILE${NC}"
982 echo -e " Size: ${YELLOW}$FILE_SIZE${NC}"
983 echo ""
984 fi
985
986 echo -e "${GREEN}🎉 Ready for similarity matrix calculation!${NC}"
987 echo -e "${CYAN}Next step: Run similarity matrix generation${NC}"
988
989else
990 echo -e "${RED}❌ GENERATION FAILED${NC}"
991 echo ""
992 echo -e "${YELLOW}📋 Error Log (last 20 lines):${NC}"
993 if [ -f "$TEMP_LOG" ]; then
994 tail -20 "$TEMP_LOG" | sed 's/^/ /'
995 fi
996 echo ""
997 echo -e "${YELLOW}💡 Troubleshooting:${NC}"
998 echo -e " 1. Check inference server status"
999 echo -e " 2. Verify EmbeddingGemma model availability"
1000 echo -e " 3. Check network connectivity"
1001 echo -e " 4. Review full log: ${CYAN}$TEMP_LOG${NC}"
1002fi
1003
# Closing banner: a blank line, then the completion timestamp framed by two
# horizontal rules.
FOOTER_RULE="${CYAN}================================================================${NC}"
echo ""
echo -e "$FOOTER_RULE"
echo -e "${BLUE}Generation completed at:${NC} $(date)"
echo -e "$FOOTER_RULE"
1008
# Cleanup. The progress file is ephemeral inter-process state and is always
# removed. The embedding log is removed only after a successful, non-debug run:
#   - in debug mode (NEOCITIES_LOG_DIR exported by run.sh --debug) it is
#     preserved for post-crash review;
#   - after a failed run it is preserved as well, because the failure report
#     tells the user to "Review full log: $TEMP_LOG" and deleting it here
#     would point them at a file that no longer exists.
if [ -z "${NEOCITIES_LOG_DIR:-}" ] && [ "${EMBED_RESULT:-0}" -eq 0 ]; then
    rm -f -- "$TEMP_LOG"
fi
rm -f -- "${DIR}/tmp/embedding_progress_${USER}.txt" 2>/dev/null

# Propagate the embedding result as this script's exit code so run.sh's
# `generate-embeddings.sh ... || exit 1` actually fires on failure. Previously
# the script fell off the end after the cleanup rm (exit 0), which masked a
# failed run and let the pipeline push on into the word/color stages against a
# half-built cache — exactly what happened when the giant-poem tail tripped the
# network-error threshold. (Found during 10-050.)
exit "${EMBED_RESULT:-0}"