generate-embeddings.sh

1023 lines

#!/bin/bash
#
# Embedding Generation Manager for Neocities Poetry Modernization
#
# Generates vector embeddings for poems using inference server embedding
# models. Supports incremental processing, cache management, and multiple
# models. When the TUI library is available, interactive mode uses a
# vim-style menu.
#
# Usage: ./generate-embeddings.sh [OPTIONS] [DIRECTORY]

# {{{ TUI Library
# The TUI is optional. It is switched on only when the menu library file
# exists AND a luajit binary is on PATH; otherwise TUI_AVAILABLE stays false
# and interactive mode uses the plain prompt-based fallback.
LIBS_DIR="/home/ritz/programming/ai-stuff/scripts/libs"
TUI_AVAILABLE=false

if [[ -f "${LIBS_DIR}/lua-menu.sh" ]] && command -v luajit >/dev/null 2>&1; then
  # shellcheck source=/dev/null
  source "${LIBS_DIR}/lua-menu.sh"
  TUI_AVAILABLE=true
fi
# }}}

# {{{ setup_dir_path
# Print the project directory the script should operate in.
#
# Arguments: $1 - optional directory; when missing or empty the built-in
#                 default project path is used instead.
# Outputs:   the chosen directory on stdout, newline-terminated.
setup_dir_path() {
  # printf rather than echo: a directory argument that looks like an echo
  # option (for example "-n" or "-e") would otherwise be swallowed.
  printf '%s\n' "${1:-/mnt/mtwo/programming/ai-stuff/neocities-modernization}"
}
# }}}

# Parse command line options first to find the directory argument.
# Every flag gets its default here; parse_cli_args overrides them.
INCREMENTAL=true
FORCE_REGEN=false
SHOW_STATUS=false
VALIDATE_CACHE=false
FLUSH_ALL=false
FLUSH_ERRORS=false
BACKUP_BEFORE_FLUSH=true
FORCE_OPERATION=false
# 10-049 + Pascal-quant note: must match what config.lua's
# inference_servers[selected].model returns (no Ollama-style ":tag" colon,
# no GGUF ".gguf" extension). The /v1/models check later substring-greps
# for this string in the server's loaded-model list, so it should be the
# family identifier ("nomic-embed-text-v1.5"), not the full file basename
# ("nomic-embed-text-v1.5.Q8_0.gguf"). When the config drifts from this
# constant the substring-grep falls back to "model not found" and bails.
MODEL_NAME="nomic-embed-text-v1.5"
LIST_MODELS=false
MODEL_STATUS=false
INTERACTIVE_MODE=false
DIRECTORY_ARG=""
ASSETS_DIR=""
# Issue 10-017: Inference server selection from config.lua
INFERENCE_SERVER=""

# {{{ print_usage
# Print the help text to stdout. Every option the parser accepts is listed,
# including --server, and the general options are grouped ahead of the
# examples.
print_usage() {
  cat <<EOF
Usage: $0 [options] [directory]
Options:
  --incremental, --inc     Use incremental processing (default)
  --full-regen, --full     Force full regeneration of all embeddings
  --status                 Show cache status without processing
  --validate               Validate cache integrity
  -I                       Interactive mode - query user for options
  --help, -h               Show this help message

Cache Management Options:
  --flush-all              Remove all cached embeddings (complete regeneration)
  --flush-errors           Remove only error entries, keep valid embeddings
  --backup-before-flush    Create timestamped backup before flushing (default)
  --no-backup              Skip backup creation when flushing
  --force                  Skip confirmation prompts for automated scripts

Model Selection Options:
  --model=MODEL_NAME       Specify embedding model (default: nomic-embed-text-v1.5)
  --server=NAME            Select the inference server entry from config.lua
  --list-models            Show available models and their configurations
  --model-status           Show cache status for all models
  --dir=PATH               Use custom assets directory instead of default

Examples:
  $0 --flush-errors                    # Clean up failed entries
  $0 --flush-all                       # Start completely fresh
  $0 --model=text-embedding-ada-002    # Use OpenAI model
  $0 --list-models                     # Show available models
  $0 --model-status                    # Show cache status for all models
EOF
}
# }}}

# {{{ parse_cli_args
# Translate command line arguments into the flag variables declared above.
#
# Arguments: "$@" - the script's command line.
# Globals:   writes the flag variables above plus DIRECTORY_ARG, ASSETS_DIR,
#            MODEL_NAME and INFERENCE_SERVER.
# Notes:     --help prints the usage text and exits 0. The first argument
#            that does not start with "--" becomes DIRECTORY_ARG; later ones,
#            and unrecognised "--" options, are ignored.
parse_cli_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --dir=*)
        ASSETS_DIR="${arg#*=}"
        ;;
      --full-regen|--full)
        INCREMENTAL=false
        FORCE_REGEN=true
        ;;
      --incremental|--inc)
        # Reset FORCE_REGEN too, so "--full --inc" does not leave the two
        # flags contradicting each other (last option wins).
        INCREMENTAL=true
        FORCE_REGEN=false
        ;;
      --status)
        SHOW_STATUS=true
        ;;
      --validate)
        VALIDATE_CACHE=true
        ;;
      --flush-all)
        FLUSH_ALL=true
        ;;
      --flush-errors)
        FLUSH_ERRORS=true
        ;;
      --backup-before-flush)
        BACKUP_BEFORE_FLUSH=true
        ;;
      --no-backup)
        BACKUP_BEFORE_FLUSH=false
        ;;
      --force)
        FORCE_OPERATION=true
        ;;
      --model=*)
        MODEL_NAME="${arg#*=}"
        ;;
      # Issue 10-017: Inference server selection
      --server=*)
        INFERENCE_SERVER="${arg#*=}"
        ;;
      --list-models)
        LIST_MODELS=true
        ;;
      --model-status)
        MODEL_STATUS=true
        ;;
      -I)
        INTERACTIVE_MODE=true
        ;;
      --help|-h)
        print_usage
        exit 0
        ;;
      *)
        # If argument doesn't start with --, treat as directory
        if [[ "$arg" != --* && -z "$DIRECTORY_ARG" ]]; then
          DIRECTORY_ARG="$arg"
        fi
        ;;
    esac
  done
}
# }}}

parse_cli_args "$@"

143

# {{{ setup_embedding_tui_menu
# Configure the TUI menu for embedding generation options.
#
# Globals: TUI_AVAILABLE (read)
# Returns: 0 once the menu is built; 1 when the TUI is unavailable or
#          tui_init fails (nothing is added to the menu in that case).
setup_embedding_tui_menu() {
  # Compare the flag as a string instead of executing its value: an unset or
  # empty TUI_AVAILABLE used to run as an empty command and count as "true".
  [[ "$TUI_AVAILABLE" == true ]] || return 1

  # Initialize TUI
  tui_init || return 1

  # Build the menu
  menu_init
  menu_set_title "Embedding Manager" "neocities-modernization - j/k:nav space:toggle Enter:run"

  # ---- Section 1: Processing Mode (radio buttons - single selection) ----
  menu_add_section "mode" "single" "Processing Mode (select one)"
  menu_add_item "mode" "incremental" "Incremental" "checkbox" "1" \
    "Process only new/changed poems (fastest)" "1" ""
  menu_add_item "mode" "full_regen" "Full Regeneration" "checkbox" "0" \
    "Regenerate all embeddings from scratch" "2" ""
  menu_add_item "mode" "status_only" "Status Check" "checkbox" "0" \
    "Show current progress without processing" "3" ""

  # ---- Section 2: Cache Management ----
  menu_add_section "cache" "multi" "Cache Management"
  menu_add_item "cache" "flush_all" "Flush All Embeddings ⚠️" "checkbox" "0" \
    "WARNING: Removes entire cache" "f" ""
  menu_add_item "cache" "flush_errors" "Flush Errors Only" "checkbox" "0" \
    "Remove failed entries, keep valid ones" "e" ""
  menu_add_item "cache" "validate" "Validate Cache" "checkbox" "0" \
    "Check integrity without changes" "v" ""

  # ---- Section 3: Cache Options ----
  menu_add_section "cache_opts" "multi" "Cache Options"
  menu_add_item "cache_opts" "backup" "Backup Before Flush" "checkbox" "1" \
    "Create timestamped backup" "b" ""
  menu_add_item "cache_opts" "force" "Skip Confirmations" "checkbox" "0" \
    "Don't prompt for dangerous operations" "s" ""

  # ---- Section 4: Model Selection ----
  # The multistate keys listed here are the short names that
  # apply_tui_selections translates into MODEL_NAME.
  menu_add_section "model" "multi" "Model Selection"
  menu_add_item "model" "model_name" "Embedding Model" "multistate" "nomic-embed-text" \
    "nomic-embed-text,qwen3-embedding,embeddinggemma,text-embedding-ada-002,all-MiniLM-L6-v2" "m" ""
  menu_add_item "model" "model_status" "Show Model Status" "checkbox" "0" \
    "Display cache stats for each model" "t" ""
  menu_add_item "model" "list_models" "List Available Models" "checkbox" "0" \
    "Show all configured models" "l" ""

  # ---- Section 5: Actions ----
  menu_add_section "actions" "single" "Actions"
  menu_add_item "actions" "run" "Run" "action" "" \
    "Execute with selected options" "r"

  return 0
}
# }}}

212

# {{{ apply_tui_selections
# Copy the values chosen in the TUI menu into the script's flag variables.
#
# Globals: writes INCREMENTAL, FORCE_REGEN, SHOW_STATUS, FLUSH_ALL,
#          FLUSH_ERRORS, VALIDATE_CACHE, BACKUP_BEFORE_FLUSH, FORCE_OPERATION,
#          MODEL_NAME, MODEL_STATUS and LIST_MODELS.
# Reads:   menu values through menu_get_value.
apply_tui_selections() {
  local mode_key chosen_model

  # Processing mode is a radio group: the first key found set wins, and the
  # three mode flags are always written together.
  for mode_key in incremental full_regen status_only; do
    [[ "$(menu_get_value "$mode_key")" == "1" ]] || continue
    case "$mode_key" in
      incremental)
        INCREMENTAL=true FORCE_REGEN=false SHOW_STATUS=false
        ;;
      full_regen)
        INCREMENTAL=false FORCE_REGEN=true SHOW_STATUS=false
        ;;
      status_only)
        SHOW_STATUS=true INCREMENTAL=true FORCE_REGEN=false
        ;;
    esac
    break
  done

  # Cache management: a checked box only switches a flag on, never off.
  if [[ "$(menu_get_value "flush_all")" == "1" ]]; then FLUSH_ALL=true; fi
  if [[ "$(menu_get_value "flush_errors")" == "1" ]]; then FLUSH_ERRORS=true; fi
  if [[ "$(menu_get_value "validate")" == "1" ]]; then VALIDATE_CACHE=true; fi

  # Cache options: the backup flag mirrors its checkbox in both directions.
  if [[ "$(menu_get_value "backup")" == "1" ]]; then
    BACKUP_BEFORE_FLUSH=true
  else
    BACKUP_BEFORE_FLUSH=false
  fi
  if [[ "$(menu_get_value "force")" == "1" ]]; then FORCE_OPERATION=true; fi

  # Model selection.
  # 10-049: MODEL_NAME uses llama.cpp family identifiers (for example
  # "nomic-embed-text-v1.5") rather than Ollama-style tags. Per the original
  # note, the embedding cache directory derives from this string via the
  # sanitize-then-mkdir pattern in utils.embeddings_dir(), so changing an
  # identifier triggers a fresh cache.
  chosen_model=$(menu_get_value "model_name")
  case "$chosen_model" in
    qwen3-embedding)
      MODEL_NAME="qwen3-embedding-4b"
      ;;
    embeddinggemma|text-embedding-ada-002|all-MiniLM-L6-v2)
      MODEL_NAME="$chosen_model"
      ;;
    *)
      # Covers "nomic-embed-text" as well as any unexpected value.
      MODEL_NAME="nomic-embed-text-v1.5"
      ;;
  esac

  if [[ "$(menu_get_value "model_status")" == "1" ]]; then MODEL_STATUS=true; fi
  if [[ "$(menu_get_value "list_models")" == "1" ]]; then LIST_MODELS=true; fi

  # Validation: flush_all takes precedence over flush_errors
  if [[ "$FLUSH_ALL" == "true" && "$FLUSH_ERRORS" == "true" ]]; then
    FLUSH_ERRORS=false
  fi
}
# }}}

264

# {{{ run_tui_interactive_mode
# Run the TUI-based interactive mode.
#
# Returns: 1 when the menu cannot be set up, so the caller can fall back to
#          the simple prompt mode; 0 after the user confirms a selection.
# Exits:   0 (whole script) when the user cancels the menu.
run_tui_interactive_mode() {
  local mode_label status_label cache_label

  if ! setup_embedding_tui_menu; then
    return 1
  fi

  if ! menu_run; then
    menu_cleanup
    echo "Operation cancelled."
    exit 0
  fi

  menu_cleanup
  apply_tui_selections

  if [ "$FORCE_REGEN" = true ]; then
    mode_label="Full regeneration"
  else
    mode_label="Incremental"
  fi

  if [ "$SHOW_STATUS" = true ]; then
    status_label="Yes"
  else
    status_label="No"
  fi

  # An explicit if/elif chain is required here. The former
  # "A && echo .. || B && echo .. || C && echo .. || echo None" one-liner
  # printed every label after the first match, because && and || have equal
  # precedence and associate left to right.
  if [ "$FLUSH_ALL" = true ]; then
    cache_label="Flush all"
  elif [ "$FLUSH_ERRORS" = true ]; then
    cache_label="Flush errors"
  elif [ "$VALIDATE_CACHE" = true ]; then
    cache_label="Validate"
  else
    cache_label="None"
  fi

  # Show selected configuration
  echo ""
  echo "Selected configuration:"
  echo "- Model: $MODEL_NAME"
  echo "- Mode: $mode_label"
  echo "- Status check: $status_label"
  echo "- Cache operations: $cache_label"
  echo ""
  return 0
}
# }}}

292

# {{{ run_simple_interactive_mode
# Fallback prompt-based interactive mode, used when the TUI is unavailable
# or fails to start.
#
# Globals: writes INCREMENTAL, FORCE_REGEN, FLUSH_ALL, FLUSH_ERRORS,
#          VALIDATE_CACHE, SHOW_STATUS and MODEL_NAME from the answers.
# Input:   answers are read line by line from stdin.
# Exits:   0 (whole script) when the user does not confirm with y/Y.
run_simple_interactive_mode() {
  local mode_choice cache_choice model_choice confirm
  local mode_label status_label cache_label

  echo "=== Embedding Generation Interactive Mode ==="
  echo ""
  echo "Select processing mode:"
  echo "1. Incremental (default) - Process only new/changed poems"
  echo "2. Full regeneration - Regenerate all embeddings"
  echo "3. Cache management - Flush/validate cache"
  echo "4. Status check - Show current progress"
  echo ""
  read -r -p "Choose option (1-4): " mode_choice

  case "$mode_choice" in
    1)
      # Clear FORCE_REGEN too, so an earlier --full on the command line
      # cannot contradict the interactive choice.
      INCREMENTAL=true
      FORCE_REGEN=false
      ;;
    2)
      INCREMENTAL=false
      FORCE_REGEN=true
      ;;
    3)
      echo ""
      echo "Cache management options:"
      echo "1. Flush all cached embeddings"
      echo "2. Flush only failed embedding attempts"
      echo "3. Validate cache integrity"
      read -r -p "Choose cache option (1-3): " cache_choice
      case "$cache_choice" in
        1) FLUSH_ALL=true ;;
        2) FLUSH_ERRORS=true ;;
        3) VALIDATE_CACHE=true ;;
      esac
      ;;
    4)
      SHOW_STATUS=true
      ;;
  esac

  # Model identifiers follow the rule stated at the MODEL_NAME default: no
  # Ollama-style ":tag" suffix. They match what apply_tui_selections sets
  # for the same models.
  echo ""
  echo "Available embedding models:"
  echo "1. nomic-embed-text-v1.5 (default)"
  echo "2. qwen3-embedding-4b"
  echo "3. embeddinggemma"
  echo "4. text-embedding-ada-002"
  echo "5. all-MiniLM-L6-v2"
  read -r -p "Choose model (1-5, or press enter for default): " model_choice

  case "$model_choice" in
    2) MODEL_NAME="qwen3-embedding-4b" ;;
    3) MODEL_NAME="embeddinggemma" ;;
    4) MODEL_NAME="text-embedding-ada-002" ;;
    5) MODEL_NAME="all-MiniLM-L6-v2" ;;
    *) MODEL_NAME="nomic-embed-text-v1.5" ;;
  esac

  if [ "$INCREMENTAL" = true ]; then
    mode_label="Incremental"
  else
    mode_label="Full regeneration"
  fi

  if [ "$SHOW_STATUS" = true ]; then
    status_label="Yes"
  else
    status_label="No"
  fi

  # if/elif instead of a chained "&& echo || ..." expression, which printed
  # every label following the first match.
  if [ "$FLUSH_ALL" = true ]; then
    cache_label="Flush all"
  elif [ "$FLUSH_ERRORS" = true ]; then
    cache_label="Flush errors"
  elif [ "$VALIDATE_CACHE" = true ]; then
    cache_label="Validate"
  else
    cache_label="None"
  fi

  echo ""
  echo "Selected configuration:"
  echo "- Model: $MODEL_NAME"
  echo "- Mode: $mode_label"
  echo "- Status check: $status_label"
  echo "- Cache operations: $cache_label"
  echo ""
  read -r -p "Continue with this configuration? (y/n): " confirm
  if [[ ! "$confirm" =~ ^[Yy] ]]; then
    echo "Operation cancelled."
    exit 0
  fi
  echo ""
}
# }}}

364

# Interactive mode handling: prefer the TUI, and drop to the simple prompt
# mode when the TUI is unavailable or reports failure.
if [[ "$INTERACTIVE_MODE" == true ]]; then
  if [[ "$TUI_AVAILABLE" != true ]] || ! run_tui_interactive_mode; then
    run_simple_interactive_mode
  fi
fi

# Resolve the project directory now that arguments are parsed, and work
# from inside it.
DIR=$(setup_dir_path "$DIRECTORY_ARG")
cd "$DIR" || exit 1

# Issue 8-059: ensure the tmpfs-backed tmp/ symlink exists before any write,
# since the progress file we share with similarity-engine.lua now lives there.
"${DIR}/scripts/ensure-tmp-symlink" "${DIR}"

# --dir argument for the Lua scripts; stays empty unless an assets directory
# was given on the command line.
ASSETS_ARG="${ASSETS_DIR:+--dir $ASSETS_DIR}"

387

# ANSI colour escapes for output. They are stored unexpanded and interpreted
# at print time (echo -e / printf %b).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Progress tracking
START_TIME=$(date +%s)
POEMS_FILE="$DIR/assets/poems.json"
EMBEDDINGS_FILE="$DIR/assets/embeddings.json"

# run.sh's --debug exports NEOCITIES_LOG_DIR → durable disk (output/debug-logs)
# so this log survives the reboot a hard GPU lock forces; the default is the
# RAM-backed tmp/. The end-of-run cleanup below is skipped when this is set.
EMBED_LOG_DIR="${NEOCITIES_LOG_DIR:-${DIR}/tmp}"
mkdir -p "$EMBED_LOG_DIR"
TEMP_LOG="${EMBED_LOG_DIR}/embedding_generation.log"

# Opening banner: three coloured lines followed by one blank line (printf
# reuses the format for each argument).
printf '%b\n' \
  "${CYAN}================================================================${NC}" \
  "${CYAN} POEM EMBEDDING GENERATION - LIVE PROGRESS MONITOR${NC}" \
  "${CYAN}================================================================${NC}" \
  ""

# Handle model-specific operations. Both modes only query similarity-engine
# and then stop: no embeddings are generated.

# --list-models: print the configured models.
if [[ "$LIST_MODELS" == true ]]; then
  lua -e "
    package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
    local engine = require('similarity-engine')
    engine.list_available_models()
  "
  exit 0
fi

# --model-status: print cache status for every model under assets/.
if [[ "$MODEL_STATUS" == true ]]; then
  lua -e "
    package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
    local engine = require('similarity-engine')
    engine.show_all_model_status('$DIR/assets')
  "
  exit 0
fi

430

# Generate model-specific paths.
# Issue 10-054: resolve the cache dir through scripts/cache-dir so this shell
# writer lands in the SAME place the Lua readers look (disk or RAM, per the
# CACHE_IN_RAM switch). It hard-errors out if the resolver yields nothing,
# rather than silently writing to a wrong/empty path.
SAFE_MODEL_NAME=$(sed 's/[^a-zA-Z0-9._-]/_/g' <<< "$MODEL_NAME")
EMBEDDINGS_DIR=$(luajit "$DIR/scripts/cache-dir" "$DIR" --model "$MODEL_NAME")
if [[ -z "$EMBEDDINGS_DIR" ]]; then
  echo "Error: could not resolve the embeddings cache dir (scripts/cache-dir)" >&2
  exit 1
fi
EMBEDDINGS_FILE="$EMBEDDINGS_DIR/embeddings.json"

# Create model directory if needed
mkdir -p "$EMBEDDINGS_DIR"

# Run summary shown before any work starts.
if [[ "$INCREMENTAL" == true ]]; then
  PROCESSING_MODE_LABEL="Incremental (default)"
else
  PROCESSING_MODE_LABEL="Full Regeneration"
fi
echo -e "${BLUE}Project Directory:${NC} $DIR"
echo -e "${BLUE}Input File:${NC} $POEMS_FILE"
echo -e "${BLUE}Model:${NC} $MODEL_NAME"
echo -e "${BLUE}Output File:${NC} $EMBEDDINGS_FILE"
echo -e "${BLUE}Processing Mode:${NC} $PROCESSING_MODE_LABEL"
echo -e "${BLUE}Start Time:${NC} $(date)"
echo ""

454

# Handle flush operations (--flush-all / --flush-errors). This mode always
# ends the script: 0 on success, cancellation or a missing cache, 1 when
# the flush itself fails.
if [[ "$FLUSH_ALL" == true || "$FLUSH_ERRORS" == true ]]; then
  echo -e "${YELLOW}🗑️ Cache Flush Operation${NC}"
  echo ""

  # When both flags are set, the narrower "errors" flush is the one used.
  if [[ "$FLUSH_ERRORS" == true ]]; then
    FLUSH_TYPE="errors"
  else
    FLUSH_TYPE="all"
  fi

  echo -e "${BLUE}Flush Type:${NC} $FLUSH_TYPE"
  echo -e "${BLUE}Target File:${NC} $EMBEDDINGS_FILE"
  echo -e "${BLUE}Backup Enabled:${NC} $BACKUP_BEFORE_FLUSH"

  # Nothing to flush without a cache file.
  if [[ ! -f "$EMBEDDINGS_FILE" ]]; then
    echo -e "${YELLOW}No cache file found${NC}"
    exit 0
  fi
  FILE_SIZE=$(du -h "$EMBEDDINGS_FILE" | cut -f1)
  echo -e "${BLUE}Current File Size:${NC} $FILE_SIZE"
  echo ""

  # Safety confirmation, skipped with --force. Only the literal answer
  # "yes" proceeds.
  if [[ "$FORCE_OPERATION" != true ]]; then
    echo -e "${YELLOW}⚠️ WARNING: This will permanently modify the embedding cache${NC}"
    case "$FLUSH_TYPE" in
      all)
        echo -e "${RED}This will DELETE ALL cached embeddings!${NC}"
        ;;
      *)
        echo -e "${YELLOW}This will remove error entries but keep valid embeddings${NC}"
        ;;
    esac
    echo ""
    read -p "Are you sure you want to proceed? (yes/no): " confirmation
    if [[ "$confirmation" != "yes" ]]; then
      echo "Operation cancelled"
      exit 0
    fi
  fi

  # Execute flush operation
  echo -e "${CYAN}Executing flush operation...${NC}"

  # Lua boolean literal for the backup argument; anything other than an
  # explicit "false" keeps the backup on.
  if [[ "$BACKUP_BEFORE_FLUSH" == false ]]; then
    BACKUP_LUA_FLAG="false"
  else
    BACKUP_LUA_FLAG="true"
  fi

  # Relative package paths are used here; the script has already changed
  # into the project directory.
  lua -e "
    package.path = package.path .. ';./libs/?.lua;./src/?.lua'
    local similarity_engine = require('similarity-engine')
    local success = similarity_engine.flush_embeddings_cache('$EMBEDDINGS_FILE', '$FLUSH_TYPE', $BACKUP_LUA_FLAG)
    if not success then
      os.exit(1)
    end
  "
  FLUSH_RESULT=$?

  echo ""
  if [[ "$FLUSH_RESULT" -ne 0 ]]; then
    echo -e "${RED}❌ Cache flush operation failed${NC}"
    exit 1
  fi
  echo -e "${GREEN}✅ Cache flush operation completed successfully${NC}"

  echo ""
  echo -e "${CYAN}================================================================${NC}"
  echo -e "${CYAN} CACHE FLUSH COMPLETE${NC}"
  echo -e "${CYAN}================================================================${NC}"
  exit 0
fi

527

# Handle status and validation modes.
# --status prints the cache statistics and exits 0. --validate on its own
# prints them (plus the validation verdict) and then continues with the
# normal run.
if [ "$SHOW_STATUS" = true ] || [ "$VALIDATE_CACHE" = true ]; then
  echo -e "${YELLOW}🔍 Checking embedding cache status...${NC}"

  if [ -f "$EMBEDDINGS_FILE" ]; then
    # One Lua pass yields a comma-separated record:
    #   total,completed,rate,percent,mode,generated,model
    # The percentage is formatted inside Lua, so no bc and no unquoted
    # nested substitution are needed on the shell side.
    CACHE_PARSE_OK=true
    if ! CACHE_INFO=$(lua -e "
      local dkjson = require('libs.dkjson')
      local f = assert(io.open('$EMBEDDINGS_FILE'))
      local data = dkjson.decode(f:read('*a'))
      f:close()
      if type(data) ~= 'table' then error('cache is not valid JSON') end

      -- Count actual embeddings in file
      local total_entries = 0
      local completed_embeddings = 0
      for id, emb in pairs(data.embeddings or {}) do
        total_entries = total_entries + 1
        if emb.embedding then
          completed_embeddings = completed_embeddings + 1
        end
      end

      local rate = total_entries > 0 and (completed_embeddings / total_entries) or 0
      local mode = data.metadata and data.metadata.processing_mode or 'unknown'
      local generated = data.metadata and data.metadata.generated_at or 'unknown'
      local model = data.metadata and data.metadata.embedding_model or 'qwen3-embedding:4b'

      print(string.format('%d,%d,%.3f,%.1f,%s,%s,%s', total_entries, completed_embeddings, rate, rate * 100, mode, generated, model))
    " 2>/dev/null); then
      # Lua or the JSON decode failed: fall back to a placeholder record
      # and remember the failure for the validation verdict below.
      CACHE_PARSE_OK=false
      CACHE_INFO="0,0,0,0.0,error,unknown,unknown"
    fi

    IFS=',' read -r CACHE_TOTAL CACHE_COMPLETED CACHE_RATE CACHE_PERCENT CACHE_MODE CACHE_DATE CACHE_MODEL <<< "$CACHE_INFO"

    echo -e "${GREEN}✓ Embedding cache found${NC}"
    echo -e "${BLUE}Cache Statistics:${NC}"
    echo -e " Total poems: ${YELLOW}$CACHE_TOTAL${NC}"
    echo -e " Completed embeddings: ${GREEN}$CACHE_COMPLETED${NC}"
    echo -e " Completion rate: ${GREEN}${CACHE_PERCENT}%${NC}"
    echo -e " Processing mode: ${PURPLE}$CACHE_MODE${NC}"
    echo -e " Generated: ${CYAN}$CACHE_DATE${NC}"
    echo -e " Model: ${PURPLE}$CACHE_MODEL${NC}"

    if [ "$VALIDATE_CACHE" = true ]; then
      echo ""
      echo -e "${YELLOW}🔍 Validating cache integrity...${NC}"
      # TODO: add deeper validation (per-entry checks). Until then the only
      # integrity signal is whether the cache could be read and decoded.
      if [ "$CACHE_PARSE_OK" = true ]; then
        echo -e "${GREEN}✓ Cache validation complete${NC}"
      else
        echo -e "${RED}❌ Cache validation failed: cache file could not be read or parsed${NC}"
      fi
    fi
  else
    echo -e "${RED}❌ No embedding cache found${NC}"
    echo -e "${YELLOW}💡 Run without --status to generate embeddings${NC}"
  fi

  if [ "$SHOW_STATUS" = true ]; then
    exit 0
  fi
fi

583

# Check prerequisites
echo -e "${YELLOW}🔍 Checking prerequisites...${NC}"

if [ ! -f "$POEMS_FILE" ]; then
  echo -e "${RED}❌ ERROR: Poems file not found at $POEMS_FILE${NC}"
  exit 1
fi

# Count total poems
TOTAL_POEMS=$(lua -e "local dkjson = require('libs.dkjson'); local f = io.open('$POEMS_FILE'); local data = dkjson.decode(f:read('*a')); f:close(); print(#data.poems)")

# Stop here when the count is not a plain number (lua missing, dkjson not
# found, malformed JSON). Otherwise TOTAL_POEMS stays empty and every later
# progress line and statistic would be computed against nothing.
if ! [[ "$TOTAL_POEMS" =~ ^[0-9]+$ ]]; then
  echo -e "${RED}❌ ERROR: Could not count poems in $POEMS_FILE${NC}" >&2
  exit 1
fi
echo -e "${GREEN}✓ Found $TOTAL_POEMS poems to process${NC}"

595

# Check inference server availability. 10-049: was Ollama; now llama.cpp.
if [ -n "$INFERENCE_SERVER" ]; then
  INFERENCE_ENDPOINT=$(luajit -e "
    package.path = '$DIR/libs/?.lua;' .. package.path
    local inference = require('inference-server-config')
    inference.set_selected_server('$INFERENCE_SERVER')
    print(inference.build_host_url())
  ")
  echo -e "${CYAN}Using Inference server: $INFERENCE_SERVER${NC}"
else
  # Default: use config default or hardcoded fallback
  INFERENCE_ENDPOINT=$(luajit -e "
    package.path = '$DIR/libs/?.lua;' .. package.path
    local inference = require('inference-server-config')
    print(inference.build_host_url())
  " 2>/dev/null || echo "http://127.0.0.1:18080")
fi

# A failed lookup (for example an unknown --server name) leaves the endpoint
# empty. Report that directly instead of probing the bare path "/v1/models".
if [ -z "$INFERENCE_ENDPOINT" ]; then
  echo -e "${RED}❌ ERROR: Could not resolve an inference endpoint${INFERENCE_SERVER:+ for server '$INFERENCE_SERVER'}${NC}" >&2
  exit 1
fi

# /v1/models is llama.cpp's OpenAI-compatible "what's loaded" endpoint.
# Used both as a liveness probe and as the source for the model list, so it
# is fetched once (with a timeout) and the response reused below.
if MODELS_JSON=$(curl -s --max-time 3 "$INFERENCE_ENDPOINT/v1/models"); then
  echo -e "${GREEN}✓ Inference server reachable at $INFERENCE_ENDPOINT${NC}"
else
  echo -e "${RED}❌ ERROR: Cannot connect to inference server at $INFERENCE_ENDPOINT${NC}"
  exit 1
fi

# Check selected embedding model. /v1/models returns {data: [{id: "..."},...]}.
# -F makes this a literal substring match: the dots in names such as
# "nomic-embed-text-v1.5" must not act as regex wildcards.
if grep -qF -- "$MODEL_NAME" <<< "$MODELS_JSON"; then
  echo -e "${GREEN}✓ $MODEL_NAME model available${NC}"
else
  echo -e "${RED}❌ ERROR: $MODEL_NAME model not found${NC}"
  echo -e "${YELLOW}💡 Loaded models on this server:${NC}"
  printf '%s\n' "$MODELS_JSON" | lua -e "
    local dkjson = require('libs.dkjson')
    local data = dkjson.decode(io.read('*a'))
    if data and data.data then
      for _, model in ipairs(data.data) do
        print(' ' .. (model.id or '(unnamed)'))
      end
    end
  " 2>/dev/null
  exit 1
fi

639

# Announce which kind of run is about to start.
echo ""
case "$INCREMENTAL" in
  true)
    echo -e "${CYAN}🚀 Starting incremental embedding generation for $TOTAL_POEMS poems...${NC}"
    echo -e "${YELLOW}💡 Only new/changed poems will be processed (time savings expected)${NC}"
    ;;
  *)
    echo -e "${CYAN}🚀 Starting FULL regeneration of embeddings for $TOTAL_POEMS poems...${NC}"
    echo -e "${YELLOW}⚠️ All embeddings will be regenerated (this may take longer)${NC}"
    ;;
esac
echo ""

649

# Graceful termination handler.
# Runs on SIGINT/SIGTERM: stops the background embedding job and the progress
# monitor (when their PIDs are known), reports how many embeddings are
# already in the cache file, prints the runtime in whole minutes, removes the
# progress file and exits 0.
cleanup_and_exit() {
  local pid_var stop_message final_count finished_at elapsed_minutes

  echo ""
  echo -e "${YELLOW}🛑 Termination signal received${NC}"
  echo -e "${CYAN}Performing graceful cleanup...${NC}"

  # Kill background processes: the embedding job first, then the monitor.
  for pid_var in EMBED_PID MONITOR_PID; do
    [ -n "${!pid_var}" ] || continue
    if [ "$pid_var" = EMBED_PID ]; then
      stop_message="Stopping embedding generation process..."
    else
      stop_message="Stopping progress monitor..."
    fi
    echo -e "${BLUE}${stop_message}${NC}"
    kill -TERM "${!pid_var}" 2>/dev/null
    wait "${!pid_var}" 2>/dev/null
  done

  # Show current progress
  if [ -f "$EMBEDDINGS_FILE" ]; then
    # Any Lua failure falls back to a count of 0.
    final_count=$(lua -e "
      local dkjson = require('libs.dkjson')
      local f = io.open('$EMBEDDINGS_FILE')
      if not f then print(0); return end
      local content = f:read('*a')
      f:close()
      if content == '' then print(0); return end
      local data = dkjson.decode(content)
      if not data or not data.embeddings then print(0); return end
      local count = 0
      -- Handle both array and object format
      if data.embeddings[1] then
        -- Array format
        for _, emb in ipairs(data.embeddings) do
          if emb.embedding then count = count + 1 end
        end
      else
        -- Object format
        for id, emb in pairs(data.embeddings) do
          if emb.embedding then count = count + 1 end
        end
      end
      print(count)
    " 2>/dev/null || echo "0")

    echo -e "${GREEN}✅ Embeddings saved to cache${NC}"
    echo -e "${BLUE}Progress preserved: $final_count/$TOTAL_POEMS embeddings completed${NC}"
    echo -e "${CYAN}Use incremental mode to resume from current position${NC}"
  fi

  finished_at=$(date +%s)
  elapsed_minutes=$(( (finished_at - START_TIME) / 60 ))
  echo -e "${BLUE}Total runtime: ${elapsed_minutes}m${NC}"

  # Cleanup progress file
  rm -f "${DIR}/tmp/embedding_progress_${USER}.txt" 2>/dev/null

  exit 0
}

# Register signal handlers
trap cleanup_and_exit INT TERM

714

# Create monitoring function
# Issue 10-022: Updated to use PID-based detection instead of pgrep for process name
# This is needed because we now use luajit -e instead of lua similarity-engine.lua
#
# Polls the progress file written by the Lua job and renders progress until
# the embedding process (EMBED_PID) is gone.
#
# Globals: DIR, USER, EMBED_PID, TOTAL_POEMS, INFERENCE_ENDPOINT and the
#          colour variables (all read-only here).
# Outputs: on a terminal, a bar redrawn in place; otherwise one
#          "Progress: N% (x/y)" line per percentage change.
monitor_progress() {
  local current_poem=0 total_poems=0 percent=0
  local raw_current raw_total
  local progress_file="${DIR}/tmp/embedding_progress_${USER}.txt"
  # EMBED_PID is set in the calling scope before monitor_progress is started
  local target_pid=$EMBED_PID
  local bar_length=50 filled bar i
  local now start_time
  start_time=$(date +%s)
  # Health checks are scheduled from the last check rather than from
  # "wall clock % 300 == 0". That test fired on every 0.2 s tick inside the
  # matching second and could be skipped altogether when a tick ran long.
  local health_check_interval=300 # 5 minutes
  local last_health_check=$start_time

  # Detect whether stdout is an interactive terminal. Under run.sh --debug,
  # stdout is a pipe to scripts/fsync-logger, which reads line-by-line and so
  # cannot render a carriage-return progress bar (the bar emits no newlines, so
  # the logger blocks waiting for one and nothing shows). In that case we fall
  # back to a newline-terminated progress LINE on each percent change, which
  # flows through the logger to both the terminal and the debug log.
  local is_tty=0
  if [ -t 1 ]; then is_tty=1; fi
  local last_reported_percent=-1

  while true; do
    # Check for real-time progress updates from the Lua script. The file is
    # a single short line ("current,total"), so it is re-read on every tick:
    # an mtime comparison has one-second granularity and misses updates
    # written within the same second.
    if [ -f "$progress_file" ]; then
      raw_current="" raw_total=""
      IFS=',' read -r raw_current raw_total 2>/dev/null < "$progress_file"
      # Accept only two plain integers. A file caught mid-write or otherwise
      # malformed keeps the previous values instead of raising
      # "integer expression expected".
      if [[ "$raw_current" =~ ^[0-9]+$ && "$raw_total" =~ ^[0-9]+$ ]]; then
        # 10# forces base 10 so a value such as "08" is not read as octal.
        current_poem=$((10#$raw_current))
        total_poems=$((10#$raw_total))
        # Calculate percentage (guard against division by zero)
        if (( total_poems > 0 )); then
          percent=$((current_poem * 100 / total_poems))
        else
          percent=0
        fi
      fi
    else
      # No progress file found - fallback to basic monitoring
      current_poem=0
      percent=0
    fi

    # Create progress bar (clamped so an overshooting counter cannot make
    # the bar longer than bar_length).
    filled=$((percent * bar_length / 100))
    if (( filled > bar_length )); then filled=$bar_length; fi
    bar=""
    for ((i = 0; i < filled; i++)); do bar="${bar}█"; done
    for ((i = filled; i < bar_length; i++)); do bar="${bar}░"; done

    # Render progress. Interactive terminal: redraw the bar in place with a
    # carriage return. Captured/non-TTY (e.g. --debug): emit one line per
    # percentage change so the line-based logger can pass it through.
    if [ "$is_tty" -eq 1 ]; then
      echo -ne "\033[2K\r${PURPLE}Progress: ${bar} ${percent}% (${current_poem}/${TOTAL_POEMS})${NC}"
    elif [ "$percent" -ne "$last_reported_percent" ]; then
      echo "Progress: ${percent}% (${current_poem}/${TOTAL_POEMS})"
      last_reported_percent=$percent
    fi

    # Check if embedding process is still running using PID
    # Issue 10-022: Changed from pgrep to kill -0 for accurate PID detection
    if ! kill -0 "$target_pid" 2>/dev/null; then
      break
    fi

    # Periodic health check of the inference server (every 5 minutes)
    now=$(date +%s)
    if (( now - last_health_check >= health_check_interval )); then
      last_health_check=$now
      if ! curl -s --max-time 3 "$INFERENCE_ENDPOINT/v1/models" > /dev/null; then
        echo ""
        echo ""
        echo -e "${RED}⚠️ INFERENCE SERVER UNAVAILABLE${NC}"
        echo -e "${YELLOW}Embedding process may fail and could corrupt the cache.${NC}"
        echo -e "${YELLOW}Consider stopping the process and restarting the inference server.${NC}"
      fi
    fi

    sleep 0.2
  done
}

799

# Start the embedding generation in background
# Issue 10-022: Use direct function call instead of piped stdin
# Piped stdin causes curl exit code 7 (connection refused) due to file descriptor issues
echo "Generating embeddings..." > "$TEMP_LOG"

# Incremental and full regeneration differ only in one boolean argument, so
# a single Lua program is built and the flag is spliced in as a Lua literal.
if [ "$INCREMENTAL" = true ]; then
  INCREMENTAL_LUA=true
else
  INCREMENTAL_LUA=false
fi

# In debug mode (NEOCITIES_LOG_DIR set by run.sh --debug) make Lua's stdout and
# stderr UNBUFFERED. Otherwise Lua holds log lines in a block buffer and a hard
# lock loses everything not yet flushed — defeating the whole point of routing
# the log through fsync-logger below. Empty string outside debug = default
# (block) buffering, which is faster for normal runs.
LUA_DEBUG_PROLOGUE="${NEOCITIES_LOG_DIR:+io.stdout:setvbuf('no'); io.stderr:setvbuf('no');}"

LUA_EMBED_PROGRAM="
  ${LUA_DEBUG_PROLOGUE}
  package.path = '$DIR/libs/?.lua;$DIR/src/?.lua;' .. package.path
  local sim = require('similarity-engine')
  local success = sim.generate_all_embeddings(
    '$POEMS_FILE',
    '$DIR/assets',
    '$INFERENCE_ENDPOINT',
    ${INCREMENTAL_LUA},
    '$MODEL_NAME'
  )
  os.exit(success and 0 or 1)
"

# In debug, pipe the embedding output through fsync-logger so each line is
# committed to disk the moment it is written (survives a hard lock). The
# process substitution is a sibling, so $! still captures the luajit PID we
# wait on below. Outside debug, the plain append keeps things fast.
if [ -z "${NEOCITIES_LOG_DIR:-}" ]; then
  luajit -e "$LUA_EMBED_PROGRAM" >> "$TEMP_LOG" 2>&1 &
else
  luajit -e "$LUA_EMBED_PROGRAM" \
    > >("${DIR}/scripts/fsync-logger" --quiet "$TEMP_LOG") 2>&1 &
fi
EMBED_PID=$!

845

# Start progress monitoring
monitor_progress &
MONITOR_PID=$!

# Wait for completion and keep the embedding job's exit status.
wait "$EMBED_PID"
EMBED_RESULT=$?

# Stop monitoring
kill "$MONITOR_PID" 2>/dev/null
wait "$MONITOR_PID" 2>/dev/null

echo ""
echo ""

# Generate completion report
END_TIME=$(date +%s)
TOTAL_TIME=$((END_TIME - START_TIME))
MINUTES=$((TOTAL_TIME / 60))
# SECONDS is a special Bash variable: assigning to it restarts a running
# timer, so the stored remainder would keep growing while the statistics
# below are computed. Unsetting it first removes the special behaviour, and
# the name then holds a plain, stable value for the rest of the script.
unset SECONDS
SECONDS=$((TOTAL_TIME % 60))

echo -e "${CYAN}================================================================${NC}"
echo -e "${CYAN} EMBEDDING GENERATION COMPLETE${NC}"
echo -e "${CYAN}================================================================${NC}"
echo ""

872if [ $EMBED_RESULT -eq 0 ] && [ -f "$EMBEDDINGS_FILE" ]; then

873 # Generate detailed statistics

874 STATS=$(lua -e "

875 local dkjson = require('libs.dkjson')

876 local f = io.open('$EMBEDDINGS_FILE')

877 local data = dkjson.decode(f:read('*a'))

878 f:close()

879

880 local total = 0

881 local successful = 0

882 local failed = 0

883 local empty_content = 0

884 local avg_length = 0

885 local total_length = 0

886 local new_embeddings = data.metadata.new_embeddings or 0

887 local reused_embeddings = data.metadata.reused_embeddings or 0

888 local processing_mode = data.metadata.processing_mode or 'unknown'

889

890 for id, emb in pairs(data.embeddings) do

891 total = total + 1

892 if emb.embedding then

893 successful = successful + 1

894 if emb.content_length then

895 total_length = total_length + emb.content_length

896 end

897 elseif emb.error == 'empty_content' then

898 empty_content = empty_content + 1

899 else

900 failed = failed + 1

901 end

902 end

903

904 if successful > 0 then

905 avg_length = math.floor(total_length / successful)

906 end

907

908 -- Guard against division by zero when embeddings array is empty

909 local success_rate = 0

910 if total > 0 then

911 success_rate = math.floor((successful / total) * 100)

912 end

913 local processing_rate = 0

914 if $TOTAL_TIME > 0 then

915 processing_rate = math.floor(successful * 3600 / $TOTAL_TIME)

916 end

917 local time_savings = 0

918 if total > 0 then

919 time_savings = math.floor((reused_embeddings / total) * 100)

920 end

921

922 print(string.format('%d,%d,%d,%d,%d,%d,%d,%d,%d,%s', total, successful, failed, empty_content, success_rate, avg_length, processing_rate, new_embeddings, time_savings, processing_mode))

923 ")

924

925 IFS=',' read -r TOTAL_PROCESSED SUCCESSFUL FAILED EMPTY_CONTENT SUCCESS_RATE AVG_LENGTH PROCESSING_RATE NEW_EMBEDDINGS TIME_SAVINGS PROCESSING_MODE <<< "$STATS"

926

927 # Check if generation actually produced embeddings

928 # terminated_network_error mode with 0 successful is a failure

929 if [ "$SUCCESSFUL" -eq 0 ] || [ "$PROCESSING_MODE" = "terminated_network_error" ]; then

930 echo -e "${RED}❌ GENERATION FAILED${NC}"

931 echo ""

932 echo -e "${YELLOW}The embedding generation terminated without completing:${NC}"

933 echo -e " Processing Mode: ${RED}$PROCESSING_MODE${NC}"

934 echo -e " Successful Embeddings: ${RED}$SUCCESSFUL${NC}"

935 echo ""

936 echo -e "${YELLOW}💡 Troubleshooting:${NC}"

937 echo -e " 1. Check inference server: curl $INFERENCE_ENDPOINT/v1/models"

938 echo -e " 2. Test embedding API: curl $INFERENCE_ENDPOINT/v1/embeddings -d '{\"model\":\"$MODEL_NAME\",\"input\":\"test\"}'"

939 echo -e " 3. Retry: ./run.sh --generate-embeddings --force"

940 exit 1

941 fi

942

943 echo -e "${GREEN}✅ GENERATION SUCCESSFUL${NC}"

944 echo ""

945 echo -e "${BLUE}📊 Processing Statistics:${NC}"

946 echo -e " Processing Mode: ${PURPLE}$PROCESSING_MODE${NC}"

947 echo -e " Total Poems Processed: ${YELLOW}$TOTAL_PROCESSED${NC}"

948 echo -e " Successful Embeddings: ${GREEN}$SUCCESSFUL${NC}"

949 if [ "$PROCESSING_MODE" = "incremental" ]; then

950 REUSED_EMBEDDINGS=$((SUCCESSFUL - NEW_EMBEDDINGS))

951 echo -e " New Embeddings Generated: ${CYAN}$NEW_EMBEDDINGS${NC}"

952 echo -e " Existing Embeddings Reused: ${GREEN}$REUSED_EMBEDDINGS${NC}"

953 echo -e " Time Savings: ${GREEN}$TIME_SAVINGS%${NC}"

954 fi

955 echo -e " Failed Embeddings: ${RED}$FAILED${NC}"

956 echo -e " Empty Content Skipped: ${YELLOW}$EMPTY_CONTENT${NC}"

957 echo -e " Success Rate: ${GREEN}$SUCCESS_RATE%${NC}"

958 echo ""

959 echo -e "${BLUE}📈 Performance Metrics:${NC}"

960 echo -e " Total Processing Time: ${YELLOW}${MINUTES}m ${SECONDS}s${NC}"

961 if [ "$PROCESSING_MODE" = "incremental" ] && [ "$NEW_EMBEDDINGS" -gt 0 ]; then

962 ACTUAL_PROCESSING_RATE=$((NEW_EMBEDDINGS * 3600 / TOTAL_TIME))

963 echo -e " New Embedding Rate: ${GREEN}$ACTUAL_PROCESSING_RATE embeddings/hour${NC}"

964 echo -e " Overall Effective Rate: ${GREEN}$PROCESSING_RATE embeddings/hour${NC}"

965 else

966 echo -e " Average Processing Rate: ${GREEN}$PROCESSING_RATE embeddings/hour${NC}"

967 fi

968 echo -e " Average Poem Length: ${CYAN}$AVG_LENGTH characters${NC}"

969 echo ""

970 echo -e "${BLUE}🎯 Technical Details:${NC}"

971 echo -e " Embedding Model: ${PURPLE}${MODEL_NAME}${NC}"

972 echo -e " Vector Dimensions: ${PURPLE}${EMBEDDING_DIM:-unknown}${NC}"

973 echo -e " CUDA Acceleration: ${GREEN}Enabled${NC}"

974 echo -e " Endpoint: ${CYAN}$INFERENCE_ENDPOINT${NC}"

975 echo ""

976

977 # File size information

978 if [ -f "$EMBEDDINGS_FILE" ]; then

979 FILE_SIZE=$(du -h "$EMBEDDINGS_FILE" | cut -f1)

980 echo -e "${BLUE}📁 Output File:${NC}"

981 echo -e " Location: ${CYAN}$EMBEDDINGS_FILE${NC}"

982 echo -e " Size: ${YELLOW}$FILE_SIZE${NC}"

983 echo ""

984 fi

985

986 echo -e "${GREEN}🎉 Ready for similarity matrix calculation!${NC}"

987 echo -e "${CYAN}Next step: Run similarity matrix generation${NC}"

988

989else

990 echo -e "${RED}❌ GENERATION FAILED${NC}"

991 echo ""

992 echo -e "${YELLOW}📋 Error Log (last 20 lines):${NC}"

993 if [ -f "$TEMP_LOG" ]; then

994 tail -20 "$TEMP_LOG" | sed 's/^/ /'

995 fi

996 echo ""

997 echo -e "${YELLOW}💡 Troubleshooting:${NC}"

998 echo -e " 1. Check inference server status"

999 echo -e " 2. Verify EmbeddingGemma model availability"

1000 echo -e " 3. Check network connectivity"

1001 echo -e " 4. Review full log: ${CYAN}$TEMP_LOG${NC}"

1002fi

1003

echo ""
echo -e "${CYAN}================================================================${NC}"
echo -e "${BLUE}Generation completed at:${NC} $(date)"
echo -e "${CYAN}================================================================${NC}"

# Cleanup. In debug mode (NEOCITIES_LOG_DIR exported by run.sh --debug) the
# embedding log is preserved for post-crash review; otherwise it is removed.
# The progress file is ephemeral inter-process state, always cleaned.
if [ -z "${NEOCITIES_LOG_DIR:-}" ]; then
    rm -f -- "$TEMP_LOG"
fi
rm -f -- "${DIR}/tmp/embedding_progress_${USER}.txt" 2>/dev/null

# Propagate the embedding result as this script's exit code so run.sh's
# `generate-embeddings.sh ... || exit 1` actually fires on failure. Falling off
# the end after the cleanup rm would exit 0, masking a failed run and letting
# the pipeline push on into the word/color stages against a half-built cache.
# (Found during 10-050.)
#
# A run where luajit exited 0 but never wrote the embeddings file is reported
# as GENERATION FAILED by the completion report, so it must not exit 0 either.
if [ "$EMBED_RESULT" -eq 0 ] && [ ! -f "$EMBEDDINGS_FILE" ]; then
    EMBED_RESULT=1
fi
exit "$EMBED_RESULT"