src/similarity-engine.lua

3-- {{{ local function setup_dir_path

-- Resolve the project root directory.
--   provided_dir  optional directory chosen by the caller
-- Returns provided_dir when it is truthy; otherwise the hard-coded default
-- checkout location.
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end

10-- }}}

32-- {{{ Model configurations

-- Registry of known embedding models, keyed by the model name callers pass in.
-- Every entry records the vector width (`dimensions`) and a per-call `timeout`;
-- `requires_prompt_prefix` is set only for models that need a task prefix.
local embedding_models = {
    -- The key is the GGUF-basename form ("nomic-embed-text-v1.5"), matching
    -- what config.lua, run.sh, and generate-embeddings.sh actually pass. The
    -- Ollama-era "model:tag" colon form ("nomic-embed-text:v1.5") stopped
    -- resolving after the 10-049 migration, so a full regen aborted with
    -- "Unknown embedding model" before sending any request.
    -- (Leftover from 10-049.)
    ["nomic-embed-text-v1.5"] = {
        dimensions = 768,
        timeout = 30,
        -- v1.5 routes through task-specific weights chosen by the prompt
        -- prefix; the active prefix is configured per inference_servers entry.
        requires_prompt_prefix = true,
    },

    ["embeddinggemma:latest"] = {
        dimensions = 768,
        timeout = 30,
    },

    -- Issue 10-031: GGUF-basename forms for the local model-comparison set.
    ["mxbai-embed-large-v1"] = {
        dimensions = 1024,
        timeout = 30,
    },

    ["embeddinggemma-300m"] = {
        dimensions = 768,
        timeout = 30,
        -- Uses a clustering task prompt, configured per inference_servers entry.
        requires_prompt_prefix = true,
    },

    ["qwen3-embedding:4b"] = {
        dimensions = 2560,
        timeout = 60, -- bigger model, longer per-call
    },

    ["qwen3-embedding:8b"] = {
        dimensions = 4096,
        timeout = 90,
    },

    ["text-embedding-ada-002"] = {
        dimensions = 1536,
        timeout = 60,
    },

    ["all-MiniLM-L6-v2"] = {
        dimensions = 384,
        timeout = 20,
    },
}

78-- }}}

80-- {{{ local function get_model_storage_path

-- Resolve (and create, if missing) the cache directory for `model_name` and
-- return the file paths kept inside it.
--
-- Issue 10-054: the model's cache dir comes from utils.embeddings_dir() so it
-- follows the RAM/disk switch. This function is the EMBEDDING GENERATOR's
-- write path (similarity-engine.lua is the embedder behind
-- generate-embeddings.sh, not just legacy matrix code), so leaving it on disk
-- is what made the flip write embeddings where no reader looked.
--
--   base_dir    ignored; kept in the signature for callers that still pass
--               get_assets_root. embeddings_dir builds the same
--               <root>/embeddings/<safe_model> path and is identical to the
--               old base_dir/embeddings/<safe_model> while the switch is off.
--   model_name  model whose cache directory is wanted
--
-- Returns a table with the `embeddings`, `similarity_matrix` and `metadata`
-- file paths. A failure to create the directory is logged, not raised.
local function get_model_storage_path(base_dir, model_name)
    local model_dir = utils.embeddings_dir(model_name)

    -- Single-quote the path for the shell (embedded quotes become '\'') so a
    -- directory containing spaces or shell metacharacters is created as one
    -- literal path instead of being split or interpreted.
    local escaped_dir = model_dir:gsub("'", "'\\''")
    local created = os.execute("mkdir -p '" .. escaped_dir .. "'")

    -- os.execute reports success as `true` on Lua 5.2+ and as exit status 0
    -- on Lua 5.1 / LuaJIT; anything else means mkdir did not succeed.
    if created ~= true and created ~= 0 then
        utils.log_error("Failed to create embeddings directory: " .. model_dir)
    end

    return {
        embeddings = model_dir .. "/embeddings.json",
        similarity_matrix = model_dir .. "/similarity_matrix.json",
        metadata = model_dir .. "/metadata.json"
    }
end

101-- }}}

103-- {{{ local function cosine_similarity

-- Cosine similarity of two equal-length numeric vectors.
--   vec1, vec2  array-style tables of numbers
-- Returns dot(vec1, vec2) / (|vec1| * |vec2|), or 0 when either vector has
-- zero magnitude. Raises an error when the lengths differ.
local function cosine_similarity(vec1, vec2)
    local dims = #vec1
    if dims ~= #vec2 then
        error("Vectors must have same dimension")
    end

    -- One pass accumulates the dot product and both squared magnitudes.
    local dot, squares1, squares2 = 0, 0, 0
    for idx = 1, dims do
        local a, b = vec1[idx], vec2[idx]
        dot = dot + (a * b)
        squares1 = squares1 + (a * a)
        squares2 = squares2 + (b * b)
    end

    local magnitude1 = math.sqrt(squares1)
    local magnitude2 = math.sqrt(squares2)

    -- A zero vector has no direction; report "no similarity" instead of
    -- dividing by zero.
    if magnitude1 == 0 or magnitude2 == 0 then
        return 0
    end

    return dot / (magnitude1 * magnitude2)
end

128-- }}}

130-- {{{ local function generate_embedding

-- Request a single embedding vector for `text` from the inference server.
--
-- model_name is required; it ends up in the request payload AND determines
-- which dimension downstream validators expect. Defaults are dangerous here
-- because the wrong model silently produces wrong-shape embeddings.
--
--   text        string to embed; the active server's task prefix is applied
--   endpoint    server base URL; "/v1/embeddings" is appended to it
--   model_name  model identifier sent in the payload; also selects the
--               per-model curl timeout from embedding_models (30 if unknown)
--
-- Returns `embedding, "success"` on success, otherwise `nil` plus one of
-- "file_error", "network_error", "empty_response", "connection_error",
-- "invalid_dimensions" or "parse_error".
local function generate_embedding(text, endpoint, model_name)
    -- POSIX single-quote escaping, so paths and URLs reach the shell as one
    -- literal argument even when they contain spaces or metacharacters.
    local function shell_quote(value)
        local escaped = tostring(value):gsub("'", "'\\''")
        return "'" .. escaped .. "'"
    end

    -- Create a temporary file to avoid shell escaping issues with the payload.
    -- Issue 8-059: route through the project's tmpfs-backed tmp/ symlink so
    -- parallel checkouts of this repository do not collide on a single shared
    -- /tmp/ filename.
    -- NOTE(review): the filename is still fixed per checkout, so two
    -- processes in the SAME checkout would share it -- confirm that never
    -- happens.
    os.execute(shell_quote(DIR .. "/scripts/ensure-tmp-symlink") .. " " .. shell_quote(DIR))
    local temp_file = DIR .. "/tmp/embedding_input.json"
    local payload = {
        model = model_name,
        -- Apply the active server's task-prefix (e.g. "clustering: " for
        -- nomic-embed-text v1.5+). No-op for models that don't need one.
        input = inference_config.format_embedding_prompt(text)
    }

    local f = io.open(temp_file, "w")
    if not f then
        utils.log_error("Failed to create temporary file")
        return nil, "file_error"
    end
    f:write(dkjson.encode(payload))
    f:close()

    -- Honour the per-model timeout from the registry (larger models declare
    -- 60/90) instead of a fixed 30; unknown models keep the previous 30.
    local model_config = embedding_models[model_name]
    local max_time = (model_config and tonumber(model_config.timeout)) or 30

    -- 10-049: /v1/embeddings (OpenAI shape) replaces Ollama's /api/embed.
    -- llama.cpp exposes a single endpoint regardless of which model is
    -- loaded, so the endpoint path is the same for every model in the
    -- embedding_models table (the per-model endpoint_path field was removed
    -- in the same migration).
    local cmd = string.format(
        'curl -s --connect-timeout 10 --max-time %s %s -H "Content-Type: application/json" -d %s',
        tostring(max_time),
        shell_quote(tostring(endpoint) .. "/v1/embeddings"),
        shell_quote("@" .. temp_file)
    )

    local handle = io.popen(cmd)
    if not handle then
        -- popen itself failed: clean up and report it like any other
        -- transport failure instead of crashing on handle:read.
        os.remove(temp_file)
        utils.log_error("Network error: failed to launch curl")
        return nil, "network_error"
    end
    local result = handle:read("*a")
    local success, _, exit_code = handle:close()

    -- Clean up temp file
    os.remove(temp_file)

    -- Check for network/connection errors
    if not success or exit_code ~= 0 then
        utils.log_error("Network error: curl failed with exit code " .. (exit_code or "unknown"))
        return nil, "network_error"
    end

    -- Check for empty or invalid response
    if not result or result:match("^%s*$") then
        utils.log_error("Empty response from API endpoint")
        return nil, "empty_response"
    end

    -- Check for curl error messages
    if result:match("curl:") or result:match("Could not resolve host") or result:match("Connection refused") then
        utils.log_error("Connection error: " .. result:gsub("\n", " "))
        return nil, "connection_error"
    end

    local parsed = dkjson.decode(result)
    -- 10-049: OpenAI shape -- vectors live under data[N].embedding rather
    -- than directly under .embeddings[N]. We send one input per call here,
    -- so we read data[1].embedding.
    if not (parsed and parsed.data and parsed.data[1] and parsed.data[1].embedding) then
        utils.log_error("Failed to parse API response: " .. (result:sub(1, 200) or "nil"))
        return nil, "parse_error"
    end

    -- Accept any positive-dimension embedding. The hardcoded "== 768" that
    -- used to live here would have rejected every output from
    -- qwen3-embedding (2560-D) or any other non-gemma model. Downstream code
    -- reads the dimension off the embedding itself rather than relying on a
    -- fixed value, so there is nothing to gain from gating here.
    local embedding = parsed.data[1].embedding
    if type(embedding) == "table" and #embedding > 0 then
        return embedding, "success"
    end

    utils.log_error("Invalid embedding response: " .. (type(embedding) == "table" and "empty table" or type(embedding)))
    return nil, "invalid_dimensions"
end

214-- }}}

216-- {{{ local function table_length

-- Count every key/value pair stored in `t`, array and hash entries alike.
-- Unlike the `#` operator this is well-defined for tables that are not
-- sequences (e.g. keyed by sparse poem indices).
local function table_length(t)
    local entries = 0
    for _key in pairs(t) do
        entries = entries + 1
    end
    return entries
end

224-- }}}

226-- {{{ local function generate_random_embedding

-- Build a pseudo-random unit vector to stand in for a real embedding (used
-- for poems with no embeddable text).
--   poem_id    seed for the generator; any non-number falls back to 12345
--   dimension  vector length, 768 when omitted
-- The vector is reproducible for a given poem_id because the global
-- math.random generator is re-seeded on every call. That reseeding is a side
-- effect visible to every other math.random user in the process.
local function generate_random_embedding(poem_id, dimension)
    dimension = dimension or 768

    local seed = 12345
    if type(poem_id) == "number" then
        seed = poem_id
    end
    math.randomseed(seed)

    local vector = {}
    local sum_of_squares = 0

    -- Draw each component uniformly from the range -1 to 1.
    for slot = 1, dimension do
        local component = math.random() * 2 - 1
        vector[slot] = component
        sum_of_squares = sum_of_squares + component * component
    end

    -- Scale to unit length so similarity calculations stay consistent.
    local magnitude = math.sqrt(sum_of_squares)
    if magnitude > 0 then
        for slot = 1, dimension do
            vector[slot] = vector[slot] / magnitude
        end
    end

    return vector
end

255-- }}}

257-- {{{ local function inherit_embedding

-- Issue 9-010: For image-only posts, inherit the embedding of the nearest
-- text poem, optionally blended with the post's own text embedding.
--
--   nearest_embedding  vector of the nearest text poem; nil means there is
--                      nothing to inherit
--   own_embedding      optional vector for the post's own text
--   dimension          number of components to use, 768 when omitted
--
-- Returns nil when nearest_embedding is missing. Returns a fresh copy of
-- nearest_embedding (pure inheritance) when own_embedding is absent or when
-- either vector has fewer than `dimension` components. Such a mismatch
-- previously raised a nil-arithmetic error; it is now handled like an
-- unavailable own embedding. Otherwise returns the unit-normalised average
-- of the two vectors.
local function inherit_embedding(nearest_embedding, own_embedding, dimension)
    dimension = dimension or 768

    if not nearest_embedding then
        return nil -- No embedding to inherit
    end

    -- Blending is only meaningful when both vectors cover every component.
    local can_combine = type(own_embedding) == "table"
        and #own_embedding >= dimension
        and #nearest_embedding >= dimension

    if not can_combine then
        -- Pure inheritance: copy so the caller never aliases the source table.
        local copy = {}
        for i = 1, dimension do
            copy[i] = nearest_embedding[i]
        end
        return copy
    end

    -- Combine embeddings: average of nearest and own. This gives semantic
    -- meaning from context while preserving any content the post has.
    local result = {}
    local norm = 0
    for i = 1, dimension do
        local blended = (nearest_embedding[i] + own_embedding[i]) / 2
        result[i] = blended
        norm = norm + blended * blended
    end

    -- Normalize to unit vector for consistent similarity calculations.
    norm = math.sqrt(norm)
    if norm > 0 then
        for i = 1, dimension do
            result[i] = result[i] / norm
        end
    end

    return result
end

296-- }}}

298-- {{{ local network_error_config

-- Limits and back-off settings for network failures while embedding.
local network_error_config = {
    -- Abort thresholds
    max_consecutive_errors = 5, -- max consecutive network errors before abort
    max_total_errors = 20,      -- max total network errors in session

    -- Retry back-off
    initial_retry_delay = 2,    -- initial delay in seconds
    max_retry_delay = 60,       -- maximum delay in seconds
    backoff_multiplier = 2,     -- exponential backoff multiplier
}

306-- }}}

308-- {{{ function migrate_legacy_cache

-- Move a pre-model-specific embeddings cache into a model's own directory.
--
-- The legacy file is first renamed to "<legacy_file>.legacy_backup", then its
-- JSON content is written to "<target_model_dir>/embeddings.json". Each step
-- is checked, so a failed rename no longer falls through to reading a
-- possibly stale backup and success is not logged after a failed write.
--
--   legacy_file       path of the old shared cache file
--   target_model_dir  directory that receives embeddings.json
--
-- Does nothing when legacy_file does not exist. Returns nothing.
-- NOTE(review): this is declared as a global function (no `local` / `M.`);
-- left that way in case other files call it -- confirm.
function migrate_legacy_cache(legacy_file, target_model_dir)
    if not utils.file_exists(legacy_file) then
        return
    end

    utils.log_info("Migrating legacy cache to model-specific storage...")

    local backup_file = legacy_file .. ".legacy_backup"
    local renamed, rename_err = os.rename(legacy_file, backup_file)
    if not renamed then
        utils.log_error("Legacy cache migration aborted: could not rename " ..
            legacy_file .. " (" .. tostring(rename_err) .. ")")
        return
    end

    local legacy_data = utils.read_json_file(backup_file)
    if not legacy_data then
        utils.log_error("Legacy cache migration failed: could not read " .. backup_file)
        return
    end

    -- NOTE(review): assumes write_json_file returns false on failure (its
    -- result is returned as a status elsewhere in this file) -- confirm.
    local written = utils.write_json_file(target_model_dir .. "/embeddings.json", legacy_data)
    if written == false then
        utils.log_error("Legacy cache migration failed: could not write " ..
            target_model_dir .. "/embeddings.json")
        return
    end

    utils.log_info("Legacy cache migrated successfully")
end

323-- }}}

325-- {{{ function M.list_available_models

-- Log every registered embedding model with its dimension count and return
-- the registry table itself.
-- Names are sorted before logging because `pairs` traversal order is
-- unspecified and the listing would otherwise be in arbitrary order.
function M.list_available_models()
    utils.log_info("Available Embedding Models:")

    local names = {}
    for model_name in pairs(embedding_models) do
        names[#names + 1] = model_name
    end
    table.sort(names)

    for _, model_name in ipairs(names) do
        local config = embedding_models[model_name]
        utils.log_info(" " .. model_name .. " (" .. config.dimensions .. " dims)")
    end

    return embedding_models
end

333-- }}}

335-- {{{ function M.get_model_status

-- Describe the embeddings cache of one model.
--   base_output_dir  passed through to get_model_storage_path
--   model_name       optional; when omitted the configured/overridden model
--                    from inference_config.get_selected_model() is used, so a
--                    model swap in config.lua (or --model on the CLI) is
--                    reflected here too
-- Returns { exists, count, location [, metadata] }. `exists` is true only
-- when the cache file is present and holds an `embeddings` table; `count` is
-- the number of entries in it.
function M.get_model_status(base_output_dir, model_name)
    if not model_name then
        model_name = inference_config.get_selected_model()
    end

    local cache_file = get_model_storage_path(base_output_dir, model_name).embeddings

    -- Only read the file when it exists; a missing file and an unreadable
    -- one are both reported as "no cache".
    local cached = utils.file_exists(cache_file) and utils.read_json_file(cache_file)
    if cached and cached.embeddings then
        return {
            exists = true,
            count = table_length(cached.embeddings),
            location = cache_file,
            metadata = cached.metadata
        }
    end

    return {
        exists = false,
        count = 0,
        location = cache_file
    }
end

364-- }}}

366-- {{{ function M.show_all_model_status

-- Log one status line per registered embedding model: its dimensions plus
-- either the number of cached embeddings with the stored completion rate, or
-- "No cache found".
--   base_output_dir  forwarded to M.get_model_status
-- Models are reported in sorted name order because `pairs` traversal order
-- is unspecified and the report would otherwise be in arbitrary order.
function M.show_all_model_status(base_output_dir)
    utils.log_info("Available Embedding Models:")

    local names = {}
    for model_name in pairs(embedding_models) do
        names[#names + 1] = model_name
    end
    table.sort(names)

    for _, model_name in ipairs(names) do
        local config = embedding_models[model_name]
        local status = M.get_model_status(base_output_dir, model_name)
        local label = " " .. model_name .. " (" .. config.dimensions .. " dims) - "

        if status.exists then
            -- Missing or non-numeric completion_rate is shown as 0.0%.
            local completion_rate = tonumber(status.metadata and status.metadata.completion_rate) or 0
            utils.log_info(label ..
                status.count .. " cached embeddings (" ..
                string.format("%.1f%%", completion_rate * 100) .. ")")
        else
            utils.log_info(label .. "No cache found")
        end
    end
end

381-- }}}

383-- {{{ function M.generate_all_embeddings

384function M.generate_all_embeddings(poems_file, base_output_dir, endpoint, incremental, model_name)

385 -- Issue 10-017: Use build_host_url() instead of deprecated OLLAMA_ENDPOINT

386 endpoint = endpoint or inference_config.build_host_url()

387 incremental = incremental ~= false -- Default to true

388 -- Default to the configured/overridden model, not a hardcoded literal (the

389 -- caller, generate-embeddings.sh, always passes one; this guards direct use).

390 model_name = model_name or inference_config.get_selected_model()

391

392 -- Get model-specific configuration

393 local model_config = embedding_models[model_name]

394 if not model_config then

395 utils.log_error("Unknown embedding model: " .. model_name)

396 return false

397 end

398

399 -- Generate model-specific file paths

400 local storage_paths = get_model_storage_path(base_output_dir, model_name)

401 local output_file = storage_paths.embeddings

402

403 utils.log_info("Using embedding model: " .. model_name)

404 utils.log_info("Storage location: " .. output_file)

405 utils.log_info("Expected dimensions: " .. model_config.dimensions)

406

407 -- Handle legacy cache migration

408 local legacy_cache = base_output_dir .. "/embeddings.json"

409 if utils.file_exists(legacy_cache) and output_file ~= legacy_cache then

410 migrate_legacy_cache(legacy_cache, base_output_dir .. "/embeddings/" .. model_name:gsub("[^%w%-_.]", "_"))

411 end

412

413 utils.log_info("Loading poems from: " .. poems_file)

414 local poems_data = utils.read_json_file(poems_file)

415 if not poems_data or not poems_data.poems then

416 utils.log_error("Failed to load poems from " .. poems_file)

417 return false

418 end

419 local poems = poems_data.poems

420

421 -- Load existing embeddings if incremental mode enabled

422 local existing_embeddings = {}

423 -- Pull the dimension from the model registry. If the model is unknown,

424 -- leave dim at nil here; it will be populated from the first embedding

425 -- we actually receive below, so the metadata reflects ground truth.

426 local model_dim = embedding_models[model_name] and embedding_models[model_name].dimensions or nil

427 local embeddings_data = {

428 metadata = {

429 total_poems = #poems,

430 embedding_model = model_name,

431 embedding_dimension = model_dim,

432 generated_at = os.date("%Y-%m-%d %H:%M:%S"),

433 endpoint = endpoint,

434 incremental_update = incremental

435 },

436 embeddings = {}

437 }

438

439 if incremental and utils.file_exists(output_file) then

440 utils.log_info("Incremental mode: Loading existing embeddings...")

441 local existing_data = utils.read_json_file(output_file)

442 if existing_data and existing_data.embeddings then

443 -- Handle both array and object formats for existing embeddings

444 -- Key insight (Issue 8-019): We store by poem_index, not by id, because

445 -- the same id can exist in multiple categories (e.g., fediverse/0002.txt

446 -- and messages/0002.txt both have id=2 but different poem_index values).

447 if type(existing_data.embeddings) == "table" then

448 if existing_data.embeddings[1] then

449 -- Array format (legacy format before poem_index)

450 -- Use poem_index from embedding if available, else use array position

451 for i, emb in ipairs(existing_data.embeddings) do

452 local key = emb.poem_index or i

453 existing_embeddings[key] = emb

454 end

455 else

456 -- Object format (current format) - key-value pairs by poem_index

457 for poem_index, emb in pairs(existing_data.embeddings) do

458 -- Store by poem_index for correct lookup

459 existing_embeddings[tonumber(poem_index)] = emb

460 end

461 end

462 end

463

464 -- Preserve existing metadata

465 if existing_data.metadata then

466 embeddings_data.metadata.original_generated_at = existing_data.metadata.generated_at

467 embeddings_data.metadata.previous_total = existing_data.metadata.total_poems

468 end

469

470 utils.log_info("Found " .. table_length(existing_embeddings) .. " existing embeddings")

471 end

472 end

473

474 -- Count poems that need processing

475 local poems_to_process = {}

476 local skipped_count = 0

477 local retry_count = 0

478 local retry_reasons = {}

479

480 if incremental then

481 -- Incremental mode: Check existing embeddings and only process missing/invalid ones

482 for i, poem in ipairs(poems) do

483 -- Use poem_index if available, fallback to array index for legacy poems.json

484 -- This ensures correct matching even when the same id appears in multiple categories.

485 local lookup_key = poem.poem_index or i

486

487 -- Only skip if embedding is valid AND dimensions are correct

488 if existing_embeddings[lookup_key] and

489 existing_embeddings[lookup_key].embedding and

490 type(existing_embeddings[lookup_key].embedding) == "table" and

491 #existing_embeddings[lookup_key].embedding == model_config.dimensions then

492 -- Skip: valid embedding found

493 embeddings_data.embeddings[lookup_key] = existing_embeddings[lookup_key]

494 skipped_count = skipped_count + 1

495 else

496 -- Re-process: no embedding, invalid embedding, or error state

497 table.insert(poems_to_process, {index = lookup_key, poem = poem})

498

499 -- Track retry reasons for reporting

500 if existing_embeddings[lookup_key] then

501 if existing_embeddings[lookup_key].error then

502 retry_count = retry_count + 1

503 local error_type = existing_embeddings[lookup_key].error

504 retry_reasons[error_type] = (retry_reasons[error_type] or 0) + 1

505 elseif existing_embeddings[lookup_key].embedding then

506 -- Invalid embedding dimensions

507 retry_count = retry_count + 1

508 retry_reasons["invalid_dimensions"] = (retry_reasons["invalid_dimensions"] or 0) + 1

509 end

510 end

511 end

512 end

513 end

514

515 if incremental then

516 utils.log_info("Incremental processing summary:")

517 utils.log_info(" Total poems: " .. #poems)

518 utils.log_info(" Valid existing embeddings: " .. skipped_count)

519

520 -- Enhanced retry reporting

521 if retry_count > 0 then

522 local retry_details = {}

523 for error_type, count in pairs(retry_reasons) do

524 table.insert(retry_details, error_type .. ": " .. count)

525 end

526 utils.log_info(" Error entries to retry: " .. retry_count .. " (" .. table.concat(retry_details, ", ") .. ")")

527 end

528

529 local new_poems = #poems_to_process - retry_count

530 if new_poems > 0 then

531 utils.log_info(" New poems to process: " .. new_poems)

532 end

533

534 utils.log_info(" Processing queue: " .. #poems_to_process .. " poems" ..

535 (retry_count > 0 and (" (" .. new_poems .. " new + " .. retry_count .. " retries)") or ""))

536 utils.log_info(" Processing savings: " .. string.format("%.1f%%", (skipped_count / #poems) * 100))

537

538 if #poems_to_process == 0 then

539 utils.log_info("✅ All embeddings already exist and are valid!")

540 embeddings_data.metadata.completed_embeddings = skipped_count

541 embeddings_data.metadata.completion_rate = 1.0

542 embeddings_data.metadata.processing_mode = "no_update_needed"

543 return utils.write_json_file(output_file, embeddings_data)

544 end

545 else

546 utils.log_info("Full regeneration mode: Processing all " .. #poems .. " poems...")

547 for i, poem in ipairs(poems) do

548 table.insert(poems_to_process, {index = i, poem = poem})

549 end

550 end

551

552 -- Issue 10-050: poems are embedded a WINDOW at a time. All normal text poems

553 -- in a window go out as ONE batched + chunked embedding call (was: one HTTP

554 -- request per poem). Window size is the batch primitive's BATCH_SIZE.

555 local window = fuzzy.BATCH_SIZE

556 if window < 1 then window = 1 end

557 -- Issue 8-021 Fix: Track newly processed poems separately to prevent overcounting.

558 -- The bug occurred when key lookups failed due to poem_index format mismatches,

559 -- causing poems to be added to poems_to_process even though they had valid embeddings

560 -- under different keys. This led to completed = skipped_count + #poems_to_process > #poems.

561 local newly_processed = 0 -- Track only newly processed poems

562 local total_poems = #poems -- Cache for sanity checks

563

564 -- Sanity check: detect potential key mismatch (Issue 8-021)

565 -- If skipped_count + #poems_to_process > #poems, there's likely a key lookup issue

566 if skipped_count + #poems_to_process > total_poems then

567 utils.log_warn("⚠️ Potential key mismatch detected:")

568 utils.log_warn(" skipped_count (" .. skipped_count .. ") + poems_to_process (" .. #poems_to_process .. ") = " .. (skipped_count + #poems_to_process))

569 utils.log_warn(" This exceeds total poems (" .. total_poems .. ")")

570 utils.log_warn(" Some embeddings may be stored under legacy keys.")

571 utils.log_warn(" Continuing with processing - data will be correct, only counter may be affected.")

572 end

573

574 -- Network error tracking

575 local consecutive_errors = 0

576 local total_errors = 0

577 local current_delay = network_error_config.initial_retry_delay

578

579 -- Write initial progress state (just counts, no timing)

580 local user = os.getenv("USER") or "ritz" -- fallback to ritz

581 -- Issue 8-059: shared with scripts/generate-embeddings.sh which reads

582 -- this file; both sides now agree on the project-local tmpfs path.

583 local progress_file = DIR .. "/tmp/embedding_progress_" .. user .. ".txt"

584 -- Issue 8-021 Fix: Use safe_completed to cap progress at total_poems

585 local safe_completed = math.min(skipped_count + newly_processed, total_poems)

586 local initial_progress = string.format("%d,%d", safe_completed, total_poems)

587 local pf = io.open(progress_file, "w")

588 if pf then

589 pf:write(initial_progress)

590 pf:close()

591 end

592

593 -- {{{ Issue 10-050 helpers (closures over the loop's running state)

594 -- write_progress: the count-only progress file generate-embeddings.sh tails.

595 -- Issue 8-059: project-local tmpfs path, shared with the bash monitor.

596 -- Issue 8-021: cap at total_poems so a key mismatch can't overcount.

597 local function write_progress()

598 local user = os.getenv("USER") or "ritz"

599 local progress_file = DIR .. "/tmp/embedding_progress_" .. user .. ".txt"

600 local safe_completed = math.min(skipped_count + newly_processed, total_poems)

601 local pf = io.open(progress_file, "w")

602 if pf then

603 pf:write(string.format("%d,%d", safe_completed, total_poems))

604 pf:close()

605 end

606 end

607

608 -- store_success: write the canonical success record. Shape is byte-for-byte

609 -- what the per-item path wrote (Issue 8-019 keys) so every downstream reader

610 -- is unaffected by the switch to batching.

611 local function store_success(poem, poem_index, poem_text, embedding)

612 embeddings_data.embeddings[poem_index] = {

613 poem_index = poem_index, -- Unique global identifier (Issue 8-019)

614 id = poem.id, -- Original source file ID (for display)

615 embedding = embedding,

616 content_length = #poem_text,

617 generated_at = os.date("%Y-%m-%d %H:%M:%S"),

618 updated_at = incremental and os.date("%Y-%m-%d %H:%M:%S") or nil

619 }

620 newly_processed = newly_processed + 1 -- Issue 8-021: Track separately

621 end

622

623 -- Options handed to the batch helper. We pass OUR endpoint and OUR prompt

624 -- formatter so fuzzy-computing's separate inference-server-config instance

625 -- never diverges from this file's server selection. Chunking uses EXACT token

626 -- counts via the server's /tokenize endpoint, and the per-chunk budget is

627 -- computed exactly below (Issue 10-050) — no char estimate anywhere.

628 local COMBINE_STRATEGY = "length_weighted_mean"

629 -- Compute the EXACT per-chunk token budget once (model context - BERT

630 -- specials - the tokenized prefix), via /tokenize. Raises here, before the

631 -- loop starts, if the server is unreachable — no silent fallback. (10-050)

632 local embed_max_tokens = fuzzy.embedding_chunk_budget(endpoint, inference_config.format_embedding_prompt)

633 local embed_opts = {

634 endpoint = endpoint,

635 format_fn = inference_config.format_embedding_prompt,

636 max_tokens = embed_max_tokens,

637 strategy = COMBINE_STRATEGY

638 }

639 -- Record the chunking parameters so a future tuning change is detectable and

640 -- can trigger cache regeneration rather than silently mixing vectors.

641 embeddings_data.metadata.chunking = {

642 tokenizer = "exact (/tokenize)",

643 max_tokens = embed_max_tokens,

644 combine_strategy = COMBINE_STRATEGY,

645 batch_size = window

646 }

647

648 -- handle_deferred: image-only (inherit) and empty (random) poems, handled

649 -- AFTER the window's normal embeddings land so a same-window nearest

650 -- neighbour is already inheritable. Logic preserved verbatim from the old

651 -- per-item path (Issue 9-010 inheritance, Issue 8-019 keys).

652 local function handle_deferred(poem, poem_index, poem_text)

653 if poem.is_image_only and poem.nearest_text_poem_index then

654 local nearest_index = poem.nearest_text_poem_index

655 local nearest_embedding = nil

656 if embeddings_data.embeddings[nearest_index] and

657 embeddings_data.embeddings[nearest_index].embedding then

658 nearest_embedding = embeddings_data.embeddings[nearest_index].embedding

659 elseif existing_embeddings[nearest_index] and

660 existing_embeddings[nearest_index].embedding then

661 nearest_embedding = existing_embeddings[nearest_index].embedding

662 end

663

664 if nearest_embedding then

665 local own_embedding = nil

666 if poem_text ~= "" then

667 own_embedding = generate_embedding(poem_text, endpoint, model_name)

668 end

669 local inherited = inherit_embedding(nearest_embedding, own_embedding, model_config.dimensions)

670 utils.log_info("Image-only post " .. poem_index .. " (ID: " .. (poem.id or "unknown") ..

671 ") - inheriting embedding from nearest text poem " .. nearest_index)

672 embeddings_data.embeddings[poem_index] = {

673 poem_index = poem_index,

674 id = poem.id,

675 embedding = inherited,

676 content_length = #poem_text,

677 is_inherited = true,

678 nearest_text_poem_index = nearest_index,

679 generated_at = os.date("%Y-%m-%d %H:%M:%S"),

680 updated_at = os.date("%Y-%m-%d %H:%M:%S")

681 }

682 newly_processed = newly_processed + 1

683 else

684 utils.log_info("Image-only post " .. poem_index .. " - nearest embedding not ready, generating random")

685 local random_embedding = generate_random_embedding(poem.id, model_config.dimensions)

686 embeddings_data.embeddings[poem_index] = {

687 poem_index = poem_index,

688 id = poem.id,

689 embedding = random_embedding,

690 content_length = 0,

691 is_random = true,

692 is_image_only = true,

693 needs_inheritance_update = true,

694 nearest_text_poem_index = nearest_index,

695 generated_at = os.date("%Y-%m-%d %H:%M:%S"),

696 updated_at = os.date("%Y-%m-%d %H:%M:%S")

697 }

698 newly_processed = newly_processed + 1

699 end

700 else

701 -- Empty poem: random embedding to place it semi-randomly.

702 utils.log_info("Empty poem content for ID: " .. (poem.id or "unknown") .. " - generating random embedding")

703 local random_embedding = generate_random_embedding(poem.id, model_config.dimensions)

704 embeddings_data.embeddings[poem_index] = {

705 poem_index = poem_index, -- Issue 8-019

706 id = poem.id,

707 embedding = random_embedding,

708 content_length = 0,

709 is_random = true,

710 generated_at = os.date("%Y-%m-%d %H:%M:%S"),

711 updated_at = os.date("%Y-%m-%d %H:%M:%S")

712 }

713 newly_processed = newly_processed + 1 -- Issue 8-021

714 end

715 end

716 -- }}}

717

718 -- Save the cache roughly every ~100 poems regardless of window size. The old

719 -- `i % 100 == 1` test assumed a step of 10; with a variable window we count

720 -- windows instead so the periodic checkpoint survives a crash mid-run.

721 local windows_since_save = 0

722 local SAVE_EVERY_WINDOWS = math.max(1, math.floor(100 / window))

723

724 for i = 1, #poems_to_process, window do

725 local batch_end = math.min(i + window - 1, #poems_to_process)

726 utils.log_info(string.format("Processing batch %d-%d of %d new/updated poems...", i, batch_end, #poems_to_process))

727

728 -- Partition this window: normal text poems get batched together; image-

729 -- only and empty poems are deferred to after the batch resolves.

730 local normal = {}

731 local deferred = {}

732 for j = i, batch_end do

733 local poem_data = poems_to_process[j]

734 local poem = poem_data.poem

735 local poem_index = poem_data.index

736 -- Issue 6-033: enhanced preprocessing for better embedding quality.

737 local poem_text = poem_extractor.extract_pure_poem_content_for_embedding(poem.content)

738 local entry = { poem = poem, poem_index = poem_index, poem_text = poem_text }

739 if poem.is_image_only and poem.nearest_text_poem_index then

740 table.insert(deferred, entry)

741 elseif poem_text == "" then

742 table.insert(deferred, entry)

743 else

744 table.insert(normal, entry)

745 end

746 end

747

748 -- Embed all normal poems of the window in ONE batched + chunked call.

749 -- A whole-batch transport failure is treated exactly like the old

750 -- per-poem network_error branch: count it, check the thresholds, back

751 -- off, and retry the SAME window.

752 if #normal > 0 then

753 local window_done = false

754 while not window_done do

755 local texts = {}

756 for k = 1, #normal do texts[k] = normal[k].poem_text end

757 utils.log_info(string.format(" Embedding %d text poems (batched, chunked)...", #normal))

758 local vectors, err = fuzzy.embed_texts_with_chunking(texts, model_name, embed_opts)

759

760 if not vectors then

761 consecutive_errors = consecutive_errors + 1

762 total_errors = total_errors + 1

763 utils.log_warn(string.format("Network error %d/%d for batch %d-%d: %s",

764 consecutive_errors, network_error_config.max_consecutive_errors,

765 i, batch_end, tostring(err)))

766

767 if consecutive_errors >= network_error_config.max_consecutive_errors then

768 local safe_completed = math.min(skipped_count + newly_processed, total_poems)

769 utils.log_error("❌ NETWORK ERROR THRESHOLD EXCEEDED")

770 utils.log_error(" • Consecutive errors: " .. consecutive_errors .. "/" .. network_error_config.max_consecutive_errors)

771 utils.log_error(" • Poems processed before termination: " .. safe_completed .. "/" .. total_poems)

772 utils.log_error("The embedding cache has been preserved.")

773 embeddings_data.metadata.completed_embeddings = safe_completed

774 embeddings_data.metadata.completion_rate = safe_completed / total_poems

775 embeddings_data.metadata.processing_mode = "terminated_network_error"

776 embeddings_data.metadata.termination_reason = "consecutive_network_errors"

777 embeddings_data.metadata.last_error_count = consecutive_errors

778 utils.write_json_file(output_file, embeddings_data)

779 return false

780 elseif total_errors >= network_error_config.max_total_errors then

781 utils.log_error("❌ TOTAL ERROR LIMIT EXCEEDED")

782 utils.log_error("Too many network errors in this session: " .. total_errors .. "/" .. network_error_config.max_total_errors)

783 return false

784 else

785 utils.log_info("Retrying in " .. current_delay .. " seconds...")

786 os.execute("sleep " .. current_delay)

787 current_delay = math.min(current_delay * network_error_config.backoff_multiplier,

788 network_error_config.max_retry_delay)

789 -- loop again: retry this whole window

790 end

791 else

792 -- Batch produced results: reset error counters (the server is

793 -- alive) and distribute vectors to each poem.

794 consecutive_errors = 0

795 current_delay = network_error_config.initial_retry_delay

796 for k = 1, #normal do

797 local n = normal[k]

798 local embedding = vectors[k]

799 if embedding and type(embedding) == "table" and #embedding == model_config.dimensions then

800 store_success(n.poem, n.poem_index, n.poem_text, embedding)

801 else

802 -- One poem's vector is missing/wrong-dimension. Single-

803 -- retry it once via the same chunk-aware path; if that

804 -- still fails, record a non-critical error so it is not

805 -- retried forever (matches the old `else` branch).

806 local single = fuzzy.embed_texts_with_chunking({ n.poem_text }, model_name, embed_opts)

807 local sv = single and single[1]

808 if sv and type(sv) == "table" and #sv == model_config.dimensions then

809 store_success(n.poem, n.poem_index, n.poem_text, sv)

810 else

811 embeddings_data.embeddings[n.poem_index] = {

812 poem_index = n.poem_index, -- Issue 8-019

813 id = n.poem.id,

814 embedding = nil,

815 error = "embedding_failed",

816 updated_at = os.date("%Y-%m-%d %H:%M:%S")

817 }

818 utils.log_warn("Non-critical error for poem " .. n.poem_index .. ": embedding_failed")

819 end

820 end

821 end

822 write_progress()

823 window_done = true

824 end

825 end

826 end

827

828 -- Now the deferred poems, with the window's fresh embeddings available.

829 for _, d in ipairs(deferred) do

830 handle_deferred(d.poem, d.poem_index, d.poem_text)

831 end

832 write_progress()

833

834 -- Periodic cache checkpoint (crash safety on long runs).

835 windows_since_save = windows_since_save + 1

836 if windows_since_save >= SAVE_EVERY_WINDOWS or batch_end == #poems_to_process then

837 windows_since_save = 0

838 local safe_completed = math.min(skipped_count + newly_processed, total_poems)

839 utils.log_info("Saving progress... (" .. newly_processed .. " new + " .. skipped_count .. " existing = " .. safe_completed .. " total)")

840 if not utils.write_json_file(output_file, embeddings_data) then

841 utils.log_error("Failed to save embeddings to " .. output_file)

842 return false

843 end

844 end

845 end

846

847 -- Issue 8-021 Fix: Use safe calculation for final metadata

848 local safe_completed = math.min(skipped_count + newly_processed, total_poems)

849 embeddings_data.metadata.completed_embeddings = safe_completed

850 embeddings_data.metadata.completion_rate = safe_completed / total_poems

851 embeddings_data.metadata.new_embeddings = newly_processed

852 embeddings_data.metadata.reused_embeddings = skipped_count

853 embeddings_data.metadata.processing_mode = incremental and "incremental" or "full_regeneration"

854 -- Note: timing_data feature was planned but never implemented.

855 -- Removed reference to undefined timing_data variable (Issue 8-018).

856

857 utils.log_info("Embedding generation complete!")

858 if incremental then

859 utils.log_info("Incremental processing results:")

860 utils.log_info(" New embeddings generated: " .. newly_processed)

861 utils.log_info(" Existing embeddings reused: " .. skipped_count)

862 utils.log_info(" Total embeddings: " .. safe_completed .. " out of " .. total_poems)

863 utils.log_info(" Time savings: " .. string.format("%.1f%%", (skipped_count / total_poems) * 100))

864 else

865 utils.log_info("Full regeneration results:")

866 utils.log_info(" Successfully generated " .. safe_completed .. " out of " .. total_poems .. " embeddings")

867 end

868 utils.log_info("Completion rate: " .. string.format("%.1f%%", (safe_completed / total_poems) * 100))

869

870 return utils.write_json_file(output_file, embeddings_data)

871end

872-- }}}

874-- {{{ function validate_similarity_matrix_currency

-- Checks whether an existing similarity matrix still matches the embeddings
-- it was generated from.
--
-- @param similarity_file path to the similarity matrix JSON
-- @param embeddings_file path to the embeddings JSON
-- @param poems_file path to the poems JSON (used for the total poem count)
-- @return table `{valid = true, metadata = ...}` when the matrix is current,
--         otherwise `{valid = false, reason = ...}` where reason is one of
--         "no_matrix_found", "no_metadata", "embedding_count_mismatch"
--         (adds current_count, matrix_count, difference) or
--         "incomplete_dataset" (adds completeness, missing_embeddings).
local function validate_similarity_matrix_currency(similarity_file, embeddings_file, poems_file)
    if not utils.file_exists(similarity_file) then
        return {valid = false, reason = "no_matrix_found"}
    end

    local similarity_data = utils.read_json_file(similarity_file)
    if type(similarity_data) ~= "table" or type(similarity_data.metadata) ~= "table" then
        return {valid = false, reason = "no_metadata"}
    end
    local metadata = similarity_data.metadata

    local embeddings_data = utils.read_json_file(embeddings_file)
    local poems_data = utils.read_json_file(poems_file)

    -- Count entries that carry a non-empty vector. pairs() is used so that
    -- gaps in the index range do not cut the count short.
    local current_embeddings = 0
    local total_entries = 0
    if type(embeddings_data) == "table" and type(embeddings_data.embeddings) == "table" then
        for _, emb in pairs(embeddings_data.embeddings) do
            total_entries = total_entries + 1
            if type(emb) == "table" and type(emb.embedding) == "table" and #emb.embedding > 0 then
                current_embeddings = current_embeddings + 1
            end
        end
    end

    local matrix_embeddings = metadata.embedding_count or 0

    if current_embeddings ~= matrix_embeddings then
        return {
            valid = false,
            reason = "embedding_count_mismatch",
            current_count = current_embeddings,
            matrix_count = matrix_embeddings,
            difference = current_embeddings - matrix_embeddings
        }
    end

    if not metadata.is_complete then
        -- The poem total is only needed for this diagnostic, so an unreadable
        -- poems file must not abort validation: fall back to the number of
        -- embedding entries instead of indexing a nil value.
        local total_poems
        if type(poems_data) == "table" and type(poems_data.poems) == "table" then
            total_poems = #poems_data.poems
        else
            total_poems = total_entries
        end

        return {
            valid = false,
            reason = "incomplete_dataset",
            completeness = metadata.matrix_completeness or 0,
            missing_embeddings = total_poems - current_embeddings
        }
    end

    return {valid = true, metadata = metadata}
end

923-- }}}

925-- {{{ function M.calculate_similarity_matrix

926-- DEPRECATED (Issue 8-029): This function generates a top-N array format that is incompatible

927-- with the HTML generator. Use calculate_full_similarity_matrix() instead, which generates

928-- the full pairwise format required by flat-html-generator.lua and other consumers.

929-- Kept for reference and potential future use cases where top-N is sufficient.

-- Builds the top-N similarity file for one embeddings file.
--
-- Unless force_regenerate is set, an existing matrix that passes
-- validate_similarity_matrix_currency() is reused. Otherwise every valid
-- embedding is compared with every other one via cosine_similarity(), the
-- top_n best matches per poem are kept, and the result is written to
-- output_file every 50 poems and after the last poem.
--
-- @param embeddings_file path to the embeddings JSON to read
-- @param output_file path of the similarity JSON to write
-- @param top_n number of neighbours kept per poem (default 10)
-- @param force_regenerate skip the currency check when truthy (default false)
-- @return boolean true when a current matrix is on disk afterwards; false
--         when the embeddings cannot be loaded, none of them is usable, or a
--         save fails
function M.calculate_similarity_matrix(embeddings_file, output_file, top_n, force_regenerate)
    top_n = top_n or 10
    force_regenerate = force_regenerate or false

    -- Need poems file for validation (use configured assets path)
    local poems_file = utils.asset_path("poems.json")

    -- Validate existing matrix unless forced to regenerate
    if not force_regenerate then
        local validation = validate_similarity_matrix_currency(output_file, embeddings_file, poems_file)
        if validation.valid then
            utils.log_info("✅ Existing similarity matrix is current and complete")
            return true
        end

        utils.log_warn("⚠️ Similarity matrix validation failed: " .. tostring(validation.reason))
        if validation.reason == "embedding_count_mismatch" then
            utils.log_info(" Current embeddings: " .. validation.current_count)
            utils.log_info(" Matrix embeddings: " .. validation.matrix_count)
            utils.log_info(" Difference: " .. validation.difference)
        elseif validation.reason == "incomplete_dataset" then
            utils.log_info(" Completeness: " .. string.format("%.1f%%", validation.completeness * 100))
            utils.log_info(" Missing embeddings: " .. validation.missing_embeddings)
        end

        -- Nothing to delete when the validator found no file at all.
        if validation.reason ~= "no_matrix_found" then
            utils.log_info("🗑️ Removing stale similarity matrix...")
            os.remove(output_file)
        end
    end

    utils.log_info("Loading embeddings from: " .. embeddings_file)
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        utils.log_error("Failed to load embeddings from " .. embeddings_file)
        return false
    end

    local embeddings = embeddings_data.embeddings
    local valid_embeddings = {}

    -- Filter out invalid embeddings (entries without a non-empty vector)
    for i, item in ipairs(embeddings) do
        if type(item) == "table" and type(item.embedding) == "table" and #item.embedding > 0 then
            valid_embeddings[#valid_embeddings + 1] = {
                index = i,
                id = item.id,
                embedding = item.embedding
            }
        end
    end

    local embedding_count = #valid_embeddings
    if embedding_count == 0 then
        -- Without this guard the loop below never runs, nothing is written,
        -- and the function would still report success.
        utils.log_error("No valid embeddings found")
        return false
    end

    -- Load poems data to get actual total count; fall back to the number of
    -- embedding entries when the poems file is missing or has no poem list.
    local poems_data = utils.read_json_file(poems_file)
    local total_poems
    if type(poems_data) == "table" and type(poems_data.poems) == "table" then
        total_poems = #poems_data.poems
    else
        total_poems = #embeddings
    end

    -- Calculate completeness metrics (guard the division for an empty list)
    local matrix_completeness = total_poems > 0 and (embedding_count / total_poems) or 0
    local is_complete = embedding_count == total_poems

    -- Warn about incomplete datasets
    if not is_complete then
        utils.log_warn("⚠️ WARNING: Incomplete dataset detected")
        utils.log_info(" Embeddings: " .. embedding_count .. " / " .. total_poems .. " poems (" .. string.format("%.1f%%", matrix_completeness * 100) .. " complete)")
        utils.log_info(" Missing: " .. (total_poems - embedding_count) .. " poems will not appear in recommendations")
        utils.log_info("")
        utils.log_info(" For complete recommendations, generate embeddings for all poems first")
    end

    utils.log_info("Calculating similarity matrix for " .. embedding_count .. " valid embeddings...")

    local similarity_data = {
        metadata = {
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            model_name = embeddings_data.metadata and embeddings_data.metadata.embedding_model or "unknown",
            total_poems = total_poems,
            embedding_count = embedding_count,
            matrix_completeness = matrix_completeness,
            is_complete = is_complete,
            top_n = top_n,
            algorithm = "cosine_similarity"
        },
        similarities = {}
    }

    -- n * (n - 1) is always even, so the floor only normalises the number
    -- for "%d" and string concatenation.
    local total_comparisons = math.floor(embedding_count * (embedding_count - 1) / 2)
    local completed_comparisons = 0

    for i = 1, embedding_count do
        local poem_a = valid_embeddings[i]
        local similarities_for_poem = {}

        -- Issue 8-024: Use carriage return to overwrite line in-place
        io.write(string.format("\r[INFO] Processing poem %d/%d (ID: %s) ", i, embedding_count, poem_a.id or "unknown"))
        io.flush()

        for j = 1, embedding_count do
            if i ~= j then
                local poem_b = valid_embeddings[j]
                similarities_for_poem[#similarities_for_poem + 1] = {
                    id = poem_b.id,
                    index = poem_b.index,
                    similarity = cosine_similarity(poem_a.embedding, poem_b.embedding)
                }

                -- Each unordered pair is counted once for progress reporting.
                if j > i then
                    completed_comparisons = completed_comparisons + 1
                end
            end
        end

        -- Sort by similarity (highest first) and keep only top N
        table.sort(similarities_for_poem, function(a, b) return a.similarity > b.similarity end)

        local top_similarities = {}
        for k = 1, math.min(top_n, #similarities_for_poem) do
            top_similarities[k] = similarities_for_poem[k]
        end

        local poem_key = poem_a.id or ("poem_" .. poem_a.index)
        similarity_data.similarities[poem_key] = {
            poem_index = poem_a.index,
            top_similar = top_similarities,
            calculated_at = os.date("%Y-%m-%d %H:%M:%S")
        }

        -- Save progress periodically
        if i % 50 == 0 or i == embedding_count then
            -- A single embedding yields zero comparisons; report 100% rather
            -- than dividing by zero.
            local progress = 100
            if total_comparisons > 0 then
                progress = (completed_comparisons / total_comparisons) * 100
            end
            -- Issue 8-024: Newline before progress to preserve it (processing line uses \r)
            io.write("\n")
            utils.log_info(string.format("Progress: %.1f%% (%d/%d comparisons)", progress, completed_comparisons, total_comparisons))

            if not utils.write_json_file(output_file, similarity_data) then
                utils.log_error("Failed to save similarity matrix to " .. output_file)
                return false
            end
        end
    end

    utils.log_info("Similarity matrix calculation complete!")
    utils.log_info("Calculated similarities for " .. embedding_count .. " poems")
    utils.log_info("Total comparisons: " .. total_comparisons)

    return true
end

1076-- }}}

1078-- {{{ function M.calculate_full_similarity_matrix

-- Builds the full pairwise similarity matrix (every ordered pair of poems).
--
-- The result maps tostring(poem id) -> tostring(other id) -> score, with the
-- diagonal fixed at 1.0 and all other scores floored to 4 decimals. The file
-- is checkpointed every 100 poems. metadata.is_complete stays false in those
-- checkpoints and is only set to true by the final save, so a run that was
-- interrupted is not mistaken for a finished matrix on the next call.
--
-- @param embeddings_file path to the embeddings JSON to read
-- @param output_file path of the matrix JSON to write
-- @param force_regenerate rebuild even if a complete matrix exists (default false)
-- @return boolean true when a complete matrix is on disk afterwards, false
--         when the embeddings cannot be used or a save fails
function M.calculate_full_similarity_matrix(embeddings_file, output_file, force_regenerate)
    force_regenerate = force_regenerate or false

    -- Reuse an existing matrix only when it was marked complete.
    if not force_regenerate and utils.file_exists(output_file) then
        local existing_data = utils.read_json_file(output_file)
        if existing_data and existing_data.metadata and existing_data.metadata.is_complete then
            utils.log_info("✅ Full similarity matrix already exists and is complete")
            return true
        end
    end

    utils.log_info("🔍 Generating FULL similarity matrix (all poem pairs)...")
    utils.log_info("⚠️ This will generate ALL 47.1M comparisons (no symmetry optimization) and may take 4-8 hours")

    -- Load embeddings
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        utils.log_error("Failed to load embeddings from " .. embeddings_file)
        return false
    end

    -- Filter out invalid embeddings (need a non-empty vector and an id)
    local valid_embeddings = {}
    for _, embedding in ipairs(embeddings_data.embeddings) do
        if type(embedding) == "table" and type(embedding.embedding) == "table"
            and #embedding.embedding > 0 and embedding.id then
            valid_embeddings[#valid_embeddings + 1] = embedding
        end
    end

    local poem_count = #valid_embeddings
    if poem_count == 0 then
        utils.log_error("No valid embeddings found")
        return false
    end

    utils.log_info(string.format("Processing %d poems for full similarity matrix", poem_count))

    local total_comparisons = poem_count * poem_count
    local completed_comparisons = 0
    local start_time = os.time()

    -- The embeddings file may lack a metadata table; do not index nil.
    local source_metadata = embeddings_data.metadata
    local model_name = "unknown"
    if type(source_metadata) == "table" and source_metadata.embedding_model then
        model_name = source_metadata.embedding_model
    end

    local similarity_data = {
        metadata = {
            is_complete = false, -- flipped to true right before the final save
            total_poems = poem_count,
            matrix_size = total_comparisons,
            algorithm = "cosine_similarity",
            model_name = model_name,
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            embedding_count = poem_count
        },
        similarities = {}
    }

    -- Generate COMPLETE similarity matrix (every ordered pair is computed
    -- independently; no symmetry optimization)
    for i = 1, poem_count do
        local poem_a = valid_embeddings[i]
        local poem_a_id = tostring(poem_a.id)
        local row = {}
        similarity_data.similarities[poem_a_id] = row

        -- Issue 8-024: Use carriage return to overwrite line in-place
        io.write(string.format("\r[INFO] Processing poem %d/%d (ID: %s) ", i, poem_count, poem_a_id))
        io.flush()

        for j = 1, poem_count do
            local poem_b = valid_embeddings[j]
            local poem_b_id = tostring(poem_b.id)

            if i == j then
                -- Self-similarity is always 1.0
                row[poem_b_id] = 1.0
            else
                local similarity = cosine_similarity(poem_a.embedding, poem_b.embedding)
                -- Floor to 4 decimal places for storage efficiency
                row[poem_b_id] = math.floor(similarity * 10000) / 10000
            end

            completed_comparisons = completed_comparisons + 1
        end

        -- Progress report every 100 poems and after the last poem
        if i % 100 == 0 or i == poem_count then
            local progress = (completed_comparisons / total_comparisons) * 100
            -- os.time() has one-second resolution; treat a sub-second run as
            -- one second so the rate stays finite.
            local elapsed_time = math.max(os.time() - start_time, 1)
            local rate = completed_comparisons / elapsed_time
            local estimated_remaining = (total_comparisons - completed_comparisons) / rate

            -- Issue 8-024: Newline before progress to preserve it (processing line uses \r)
            io.write("\n")
            utils.log_info(string.format("Progress: %.2f%% (%d/%d comparisons)",
                progress, completed_comparisons, total_comparisons))
            utils.log_info(string.format("Rate: %.0f comparisons/sec, Est. remaining: %.0f minutes",
                rate, estimated_remaining / 60))

            -- Checkpoint to limit data loss. The last poem is skipped here
            -- because the final save below writes the same data immediately.
            if i < poem_count then
                if not utils.write_json_file(output_file, similarity_data) then
                    utils.log_error("Failed to save similarity matrix to " .. output_file)
                    return false
                end
                utils.log_info("✅ Progress saved to disk")
            end
        end

        -- Memory management: force garbage collection periodically
        if i % 500 == 0 then
            collectgarbage("collect")
        end
    end

    -- Final save with completion flag and timestamp
    similarity_data.metadata.is_complete = true
    similarity_data.metadata.completed_at = os.date("%Y-%m-%d %H:%M:%S")
    similarity_data.metadata.generation_time_seconds = os.time() - start_time

    if not utils.write_json_file(output_file, similarity_data) then
        utils.log_error("Failed to save final similarity matrix")
        return false
    end

    utils.log_info("🎉 Full similarity matrix generation complete!")
    utils.log_info(string.format("Total comparisons: %d", total_comparisons))
    utils.log_info(string.format("Generation time: %.1f minutes", (os.time() - start_time) / 60))
    utils.log_info(string.format("Matrix saved to: %s", output_file))

    return true
end

1212-- }}}

1214-- {{{ function M.calculate_triangular_similarity_matrix

-- Builds an upper-triangular similarity matrix: for poems at positions i < j
-- in the filtered embeddings list, the score is stored once under
-- similarities[tostring(id_i)][tostring(id_j)], floored to 4 decimals.
--
-- The file is checkpointed every 100 poems. metadata.is_complete stays false
-- in those checkpoints and is set to true only by the final save; an existing
-- file is reused only when it carries that flag, so an interrupted run is
-- regenerated instead of being treated as finished.
--
-- @param embeddings_file path to the embeddings JSON to read
-- @param output_file path of the matrix JSON to write
-- @param force_regenerate rebuild even if a matrix already exists
-- @return boolean true when a complete matrix is on disk afterwards, false
--         when the embeddings cannot be used or a save fails
function M.calculate_triangular_similarity_matrix(embeddings_file, output_file, force_regenerate)
    utils.log_info("🔍 Generating TRIANGULAR similarity matrix (optimized storage)...")

    -- Check if output already exists and not forcing regeneration
    if not force_regenerate and utils.file_exists(output_file) then
        local existing_data = utils.read_json_file(output_file)
        if type(existing_data) == "table" and type(existing_data.metadata) == "table"
            and existing_data.metadata.is_complete then
            utils.log_info("Triangular similarity matrix already exists. Use force_regenerate=true to recreate.")
            return true
        end
        utils.log_warn("Existing triangular similarity matrix is incomplete or unreadable; regenerating.")
    end

    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        utils.log_error("Failed to load embeddings file: " .. embeddings_file)
        return false
    end

    -- Filter out invalid embeddings (same rule as the full matrix function)
    local poems = {}
    for _, embedding in ipairs(embeddings_data.embeddings) do
        if type(embedding) == "table" and type(embedding.embedding) == "table"
            and #embedding.embedding > 0 and embedding.id then
            poems[#poems + 1] = embedding
        end
    end

    local poem_count = #poems
    if poem_count == 0 then
        utils.log_error("No valid embeddings found")
        return false
    end

    utils.log_info("Processing " .. poem_count .. " poems for triangular similarity matrix")

    -- n * (n - 1) is always even; the floor only normalises the number for "%d".
    local total_unique_pairs = math.floor((poem_count * (poem_count - 1)) / 2)
    utils.log_info(string.format("⚠️ This will generate %d unique comparisons (50%% reduction from full matrix)", total_unique_pairs))
    utils.log_info("⚠️ Expected storage: ~50% reduction from full matrix size")

    -- The sibling matrix builders read the model from metadata.embedding_model;
    -- the top-level model_name field is kept as a fallback for older files.
    local model_name
    if type(embeddings_data.metadata) == "table" then
        model_name = embeddings_data.metadata.embedding_model
    end
    model_name = model_name or embeddings_data.model_name or "unknown"

    local similarity_data = {
        metadata = {
            matrix_size = total_unique_pairs,
            total_poems = poem_count,
            model_name = model_name,
            algorithm = "cosine_similarity",
            embedding_count = poem_count,
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            is_complete = false, -- flipped to true right before the final save
            storage_format = "triangular_upper"
        },
        similarities = {}
    }

    local start_time = os.time()
    local completed = 0

    -- Generate upper triangular matrix only (i < j)
    for i = 1, poem_count do
        local poem_a = poems[i]
        local row = {}
        similarity_data.similarities[tostring(poem_a.id)] = row

        for j = i + 1, poem_count do
            local poem_b = poems[j]

            local similarity = cosine_similarity(poem_a.embedding, poem_b.embedding)
            row[tostring(poem_b.id)] = math.floor(similarity * 10000) / 10000 -- floored to 4 decimals

            completed = completed + 1

            -- Progress reporting every 10000 comparisons
            if completed % 10000 == 0 then
                local progress_percent = (completed / total_unique_pairs) * 100
                -- os.time() has one-second resolution; avoid dividing by zero.
                local elapsed = math.max(os.time() - start_time, 1)
                local rate = completed / elapsed
                local remaining_time = (total_unique_pairs - completed) / rate / 60

                utils.log_info(string.format("Progress: %.2f%% (%d/%d comparisons)",
                    progress_percent, completed, total_unique_pairs))
                utils.log_info(string.format("Rate: %.0f comparisons/sec, Est. remaining: %.1f minutes",
                    rate, remaining_time))
            end
        end

        -- Progressive saving every 100 poems. The last poem is skipped here
        -- because the final save below writes the same data immediately.
        if i % 100 == 0 and i < poem_count then
            if not utils.write_json_file(output_file, similarity_data) then
                utils.log_error("Failed to save triangular similarity matrix")
                return false
            end
            utils.log_info(string.format("✅ Progress saved to disk (poem %d/%d)", i, poem_count))
        end

        -- Garbage collection every 500 poems
        if i % 500 == 0 then
            collectgarbage()
            utils.log_info(string.format("🗑️ Memory cleanup completed (poem %d/%d)", i, poem_count))
        end
    end

    -- Final save, now marked complete
    similarity_data.metadata.is_complete = true
    if not utils.write_json_file(output_file, similarity_data) then
        utils.log_error("Failed to save triangular similarity matrix")
        return false
    end

    utils.log_info("✅ TRIANGULAR similarity matrix generation completed!")
    utils.log_info(string.format("Total unique comparisons: %d", total_unique_pairs))
    utils.log_info(string.format("Generation time: %.1f minutes", (os.time() - start_time) / 60))
    utils.log_info(string.format("Matrix saved to: %s", output_file))
    utils.log_info("📊 Storage optimized: ~50% reduction from full matrix")

    return true
end

1325-- }}}

1327-- {{{ function M.get_similarity_triangular

-- Looks up the similarity of two poems in a triangular matrix.
--
-- @param matrix table with a `similarities` map of string id -> string id -> score
-- @param id1 first poem id (number or string)
-- @param id2 second poem id (number or string)
-- @return number 1.0 when both ids denote the same poem, the stored score
--         when the pair is present, otherwise 0.0 (after logging a warning)
function M.get_similarity_triangular(matrix, id1, id2)
    -- Handle diagonal (self-similarity)
    if id1 == id2 then return 1.0 end

    local num1, num2 = tonumber(id1), tonumber(id2)
    local first_key, second_key
    if num1 and num2 then
        -- "5" and 5 are the same poem even though they are not raw-equal.
        if num1 == num2 then return 1.0 end
        -- Preferred orientation for numeric ids: smaller id -> larger id.
        first_key = tostring(math.min(num1, num2))
        second_key = tostring(math.max(num1, num2))
    else
        -- Non-numeric ids cannot be ordered numerically; use them as given.
        first_key, second_key = tostring(id1), tostring(id2)
        if first_key == second_key then return 1.0 end
    end

    local similarities = type(matrix) == "table" and matrix.similarities or nil
    if type(similarities) == "table" then
        local row = similarities[first_key]
        local score = row and row[second_key]
        if not score then
            -- The generator stores each pair in embeddings-list order, which
            -- need not be ascending by id, so also try the other orientation.
            row = similarities[second_key]
            score = row and row[first_key]
        end
        if score then
            return score
        end
    end

    -- Fallback (should not happen with complete matrix)
    utils.log_warn(string.format("Similarity not found for poems %s and %s", tostring(id1), tostring(id2)))
    return 0.0
end

1346-- }}}

1348-- {{{ function M.get_all_similarities_for_poem_triangular

-- Collects the similarity of one poem to every other poem in poem_ids, using
-- M.get_similarity_triangular for each lookup.
--
-- @param matrix triangular matrix passed through to M.get_similarity_triangular
-- @param poem_id id of the poem to rank against (number or string)
-- @param poem_ids sequence of candidate ids; nil is treated as empty
-- @return table sequence of `{target_id = ..., score = ...}` sorted by score,
--         highest first; the poem itself is never included
function M.get_all_similarities_for_poem_triangular(matrix, poem_id, poem_ids)
    local similarities = {}
    local poem_number = tonumber(poem_id)

    for _, other_id in ipairs(poem_ids or {}) do
        -- Ids may arrive as numbers from one source and strings from another;
        -- compare numerically as well so "12" and 12 count as the same poem.
        local is_self = other_id == poem_id
        if not is_self and poem_number ~= nil then
            is_self = tonumber(other_id) == poem_number
        end

        if not is_self then
            similarities[#similarities + 1] = {
                target_id = other_id,
                score = M.get_similarity_triangular(matrix, poem_id, other_id)
            }
        end
    end

    -- Sort by similarity score (descending)
    table.sort(similarities, function(a, b)
        return a.score > b.score
    end)

    return similarities
end

1369-- }}}

1371-- {{{ function M.generate_similarity_report

-- Summarises a top-N similarity file into a small JSON report.
--
-- Only entries that carry a non-empty `top_similar` list contribute to the
-- average/min/max figures; every key of `similarities` is counted in
-- poems_with_similarities. Pairs whose best score exceeds 0.8 are listed in
-- sample_similarities.
--
-- @param similarity_file path to the similarity JSON to analyse
-- @param poems_file path to the poems JSON (used for the total poem count)
-- @param output_file path of the report JSON to write
-- @return false when an input cannot be used, otherwise the result of
--         utils.write_json_file for the report
function M.generate_similarity_report(similarity_file, poems_file, output_file)
    utils.log_info("Generating similarity analysis report...")

    local similarity_data = utils.read_json_file(similarity_file)
    local poems_data = utils.read_json_file(poems_file)

    if not similarity_data or not poems_data then
        utils.log_error("Failed to load required data files")
        return false
    end
    if type(similarity_data) ~= "table" or type(similarity_data.similarities) ~= "table" then
        utils.log_error("Failed to load required data files")
        return false
    end

    -- Elsewhere in this file the poem list is read from the `poems` field of
    -- the decoded JSON; fall back to a bare array for files without it.
    local total_poems = 0
    if type(poems_data) == "table" then
        if type(poems_data.poems) == "table" then
            total_poems = #poems_data.poems
        else
            total_poems = #poems_data
        end
    end

    local report = {
        metadata = {
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            total_poems = total_poems,
            poems_with_similarities = 0,
            average_similarity = 0,
            max_similarity = 0,
            min_similarity = 1
        },
        statistics = {},
        sample_similarities = {}
    }

    local total_similarity = 0
    local similarity_count = 0
    -- Track the extremes separately so that the 0 / 1 placeholders above do
    -- not clamp real values (e.g. a data set whose scores are all negative).
    local highest_seen, lowest_seen

    for poem_id, data in pairs(similarity_data.similarities) do
        report.metadata.poems_with_similarities = report.metadata.poems_with_similarities + 1

        local top_similar = type(data) == "table" and data.top_similar or nil
        if type(top_similar) == "table" and #top_similar > 0 then
            -- top_similar is stored best-first, so its ends are the extremes.
            local max_sim = top_similar[1].similarity
            local min_sim = top_similar[#top_similar].similarity

            if highest_seen == nil or max_sim > highest_seen then
                highest_seen = max_sim
            end
            if lowest_seen == nil or min_sim < lowest_seen then
                lowest_seen = min_sim
            end

            for _, sim in ipairs(top_similar) do
                total_similarity = total_similarity + sim.similarity
                similarity_count = similarity_count + 1
            end

            -- Add sample for high-similarity pairs
            if max_sim > 0.8 then
                table.insert(report.sample_similarities, {
                    poem_a_id = poem_id,
                    poem_b_id = top_similar[1].id,
                    similarity = max_sim
                })
            end
        end
    end

    if highest_seen ~= nil then
        report.metadata.max_similarity = highest_seen
        report.metadata.min_similarity = lowest_seen
    end

    if similarity_count > 0 then
        report.metadata.average_similarity = total_similarity / similarity_count
    end

    utils.log_info("Similarity analysis complete!")
    utils.log_info("Poems with similarities: " .. report.metadata.poems_with_similarities)
    utils.log_info("Average similarity: " .. string.format("%.3f", report.metadata.average_similarity))
    utils.log_info("Similarity range: " .. string.format("%.3f - %.3f", report.metadata.min_similarity, report.metadata.max_similarity))

    return utils.write_json_file(output_file, report)
end

1436-- }}}

1438-- {{{ function M.generate_all_model_similarity_matrices

-- Generates a similarity matrix for every model whose embeddings cover at
-- least min_completeness of the corpus.
--
-- @param base_output_dir directory passed to M.get_model_status and
--        get_model_storage_path
-- @param min_completeness required fraction of embedded poems (default 0.8)
-- @param use_full_matrix build the full pairwise matrix instead of the
--        top-N one (default false)
-- @param total_poems corpus size used for the completeness ratio; optional,
--        defaults to 6860 (the value previously hard-coded here)
-- @return table keyed by model name. Processed models get success,
--         completeness, embedding_count, matrix_file, generation_time and
--         matrix_type; skipped models get success = false plus a reason
--         ("insufficient_completeness" or "no_embeddings"). An empty table
--         is returned when no model is eligible.
function M.generate_all_model_similarity_matrices(base_output_dir, min_completeness, use_full_matrix, total_poems)
    min_completeness = min_completeness or 0.8 -- 80% minimum completeness
    use_full_matrix = use_full_matrix or false -- Default to sparse matrices
    if type(total_poems) ~= "number" or total_poems <= 0 then
        total_poems = 6860 -- Known total poem count
    end

    utils.log_info("🔄 Generating similarity matrices for all eligible models...")
    utils.log_info("⚙️ Minimum completeness required: " .. (min_completeness * 100) .. "%")
    utils.log_info("📊 Matrix type: " .. (use_full_matrix and "FULL (all comparisons)" or "SPARSE (top-N)"))

    local models = M.list_available_models()
    local results = {}
    local eligible_count = 0

    -- First pass: check eligibility. Each model's status is queried once and
    -- remembered, so the second pass works on exactly the models counted here.
    local candidates = {}
    for model_name in pairs(models) do
        local status = M.get_model_status(base_output_dir, model_name)
        local candidate = {name = model_name, status = status, eligible = false, completeness = 0}

        if status.exists then
            local completeness = status.count / total_poems
            candidate.completeness = completeness

            if completeness >= min_completeness then
                candidate.eligible = true
                eligible_count = eligible_count + 1
                utils.log_info("✅ " .. model_name .. " (" .. string.format("%.1f%% complete, %d poems)", completeness * 100, status.count))
            else
                utils.log_warn("⚠️ Skipping " .. model_name ..
                    " (only " .. string.format("%.1f%% complete, %d poems)", completeness * 100, status.count))
            end
        else
            utils.log_info("❌ No embeddings found for " .. model_name)
        end

        candidates[#candidates + 1] = candidate
    end

    if eligible_count == 0 then
        utils.log_warn("No models meet the minimum completeness requirement")
        return {}
    end

    utils.log_info("📈 Processing " .. eligible_count .. " eligible models")

    local current_model = 0

    -- Second pass: generate matrices
    for index = 1, #candidates do
        local candidate = candidates[index]
        local model_name = candidate.name
        local status = candidate.status

        if not status.exists then
            results[model_name] = {
                success = false,
                reason = "no_embeddings",
                completeness = 0,
                embedding_count = 0
            }
        elseif not candidate.eligible then
            results[model_name] = {
                success = false,
                reason = "insufficient_completeness",
                completeness = candidate.completeness,
                embedding_count = status.count,
                required_completeness = min_completeness
            }
        else
            current_model = current_model + 1

            utils.log_info(string.format("🔄 [%d/%d] Processing %s", current_model, eligible_count, model_name))

            local storage_paths = get_model_storage_path(base_output_dir, model_name)
            local matrix_file = storage_paths.similarity_matrix
            if use_full_matrix then
                -- gsub also returns a match count; the single-target
                -- assignment keeps only the rewritten path.
                matrix_file = matrix_file:gsub("%.json$", "_full.json")
            end

            local start_time = os.time()
            local success

            if use_full_matrix then
                success = M.calculate_full_similarity_matrix(
                    storage_paths.embeddings,
                    matrix_file,
                    false -- Don't force regenerate unless needed
                )
            else
                success = M.calculate_similarity_matrix(
                    storage_paths.embeddings,
                    matrix_file
                )
            end

            local generation_time = os.time() - start_time

            results[model_name] = {
                success = success,
                completeness = candidate.completeness,
                embedding_count = status.count,
                matrix_file = matrix_file,
                generation_time = generation_time,
                matrix_type = use_full_matrix and "full" or "sparse"
            }

            if success then
                utils.log_info(string.format("✅ Matrix generation complete for %s (took %d seconds)", model_name, generation_time))
            else
                utils.log_error("❌ Matrix generation failed for " .. model_name)
            end
        end
    end

    -- Summary report: a result without success and without a reason is a
    -- model that was attempted and failed.
    local successful_models = 0
    local skipped_models = 0
    local failed_models = 0

    for _, result in pairs(results) do
        if result.success then
            successful_models = successful_models + 1
        elseif result.reason then
            skipped_models = skipped_models + 1
        else
            failed_models = failed_models + 1
        end
    end

    utils.log_info("📊 Generation Summary:")
    utils.log_info(" ✅ Successful: " .. successful_models .. " models")
    utils.log_info(" ⚠️ Skipped: " .. skipped_models .. " models")
    utils.log_info(" ❌ Failed: " .. failed_models .. " models")

    return results
end

1570-- }}}

1572-- {{{ function M.compare_model_similarities

-- Reports, per model, whether a similarity matrix file is available.
--
-- @param poem_id id of the poem being compared (only used in the log line)
-- @param base_output_dir directory passed to get_model_storage_path
-- @param models optional sequence of model names; when nil or empty, every
--        model from M.list_available_models() is used. The caller's table is
--        never modified.
-- @param use_full_matrix look for the "_full.json" matrix instead of the
--        sparse one (default false)
-- @return table keyed by model name: `{matrix_available = true, matrix_type,
--         matrix_file}` or `{matrix_available = false, reason = "matrix_not_found"}`
function M.compare_model_similarities(poem_id, base_output_dir, models, use_full_matrix)
    use_full_matrix = use_full_matrix or false

    -- Work on a private copy so an empty table passed by the caller is not
    -- filled with model names as a side effect.
    local model_names = {}
    if type(models) == "table" then
        for _, model_name in ipairs(models) do
            model_names[#model_names + 1] = model_name
        end
    end

    -- If no models specified, use all available models
    if #model_names == 0 then
        for model_name in pairs(M.list_available_models()) do
            model_names[#model_names + 1] = model_name
        end
        -- pairs() order is unspecified; sort for a stable log order.
        table.sort(model_names)
    end

    utils.log_info("🔍 Comparing similarities for poem " .. tostring(poem_id) .. " across models")

    local comparisons = {}

    for _, model_name in ipairs(model_names) do
        local storage_paths = get_model_storage_path(base_output_dir, model_name)
        local matrix_file = storage_paths.similarity_matrix
        if use_full_matrix then
            matrix_file = matrix_file:gsub("%.json$", "_full.json")
        end

        if utils.file_exists(matrix_file) then
            -- For now, generate basic similarity data - this would integrate with recommendation system
            comparisons[model_name] = {
                matrix_available = true,
                matrix_type = use_full_matrix and "full" or "sparse",
                matrix_file = matrix_file
            }
            utils.log_info("✅ " .. model_name .. " - Matrix available")
        else
            comparisons[model_name] = {
                matrix_available = false,
                reason = "matrix_not_found"
            }
            utils.log_info("❌ " .. model_name .. " - Matrix not found")
        end
    end

    return comparisons
end

1614-- }}}

1616-- {{{ function M.get_multi_model_status

-- Logs and returns, per model, how many embeddings exist and which
-- similarity matrix files are present.
--
-- @param base_output_dir directory passed to M.get_model_status and
--        get_model_storage_path
-- @param min_completeness fraction of the corpus a model needs to be
--        eligible for matrix generation; optional, defaults to 0.8
-- @param total_poems corpus size used for the completeness ratio; optional,
--        defaults to 6860 (the value previously hard-coded here)
-- @return table keyed by model name with dimensions, embedding_count,
--         completeness, sparse_matrix_exists, full_matrix_exists and
--         eligible_for_generation
function M.get_multi_model_status(base_output_dir, min_completeness, total_poems)
    if type(min_completeness) ~= "number" then
        min_completeness = 0.8
    end
    if type(total_poems) ~= "number" or total_poems <= 0 then
        total_poems = 6860
    end

    utils.log_info("📊 Per-Model Similarity Matrix Status:")

    local models = M.list_available_models()
    local status_summary = {}

    for model_name, config in pairs(models) do
        local status = M.get_model_status(base_output_dir, model_name)
        local storage_paths = get_model_storage_path(base_output_dir, model_name)

        local sparse_matrix_exists = utils.file_exists(storage_paths.similarity_matrix)
        local full_matrix_file = storage_paths.similarity_matrix:gsub("%.json$", "_full.json")
        local full_matrix_exists = utils.file_exists(full_matrix_file)

        -- A model without embeddings may report no count at all; use 0 for
        -- the arithmetic below instead of operating on nil.
        local counted = status.exists and status.count or 0
        local completeness = counted / total_poems

        utils.log_info(" " .. model_name .. " (" .. config.dimensions .. " dims)")

        if status.exists then
            utils.log_info(string.format(" ✅ Embeddings: %d/%d (%.1f%%)",
                counted, total_poems, completeness * 100))
        else
            utils.log_info(" ❌ Embeddings: 0/" .. total_poems .. " (0%)")
        end

        if sparse_matrix_exists then
            utils.log_info(" ✅ Sparse Matrix: Generated")
        else
            utils.log_info(" ❌ Sparse Matrix: Not generated")
        end

        if full_matrix_exists then
            utils.log_info(" ✅ Full Matrix: Generated")
        else
            utils.log_info(" ❌ Full Matrix: Not generated")
        end

        if completeness < min_completeness then
            local needed = math.ceil((min_completeness * total_poems) - counted)
            utils.log_info(" 🔄 Recommendation: Complete " .. needed .. " more embeddings")
        end

        status_summary[model_name] = {
            dimensions = config.dimensions,
            embedding_count = status.count,
            completeness = completeness,
            sparse_matrix_exists = sparse_matrix_exists,
            full_matrix_exists = full_matrix_exists,
            eligible_for_generation = completeness >= min_completeness
        }
    end

    return status_summary
end

1672-- }}}

1674-- {{{ function M.main

-- Entry point for the similarity engine.
--
-- With a truthy `interactive_mode`, prints a numbered menu, reads one choice
-- from stdin and runs the matching operation (embedding generation, sparse or
-- full matrix calculation, report generation, the complete pipeline,
-- multi-model generation, multi-model status, or model comparison).
-- Otherwise, generates the similarity report from an existing matrix file if
-- one is present and logs a hint if not.
--
-- All prompts tolerate end-of-input on stdin: a missing line is treated as an
-- empty answer, which selects the documented default for that prompt.
--
-- Returns nothing.
function M.main(interactive_mode)
    -- Canonical matrix filename in the assets directory. The underscore form
    -- is the one settled on by Issue 8-032; the interactive report/pipeline
    -- options previously still used the hyphenated "similarity-matrix.json",
    -- so they did not agree with the non-interactive branch.
    local similarity_matrix_name = "similarity_matrix.json"
    local default_model = "embeddinggemma:latest"

    -- Read one line from stdin; EOF yields "" so string methods are safe.
    local function read_line()
        return io.read() or ""
    end

    -- Ask for an embedding model name; an empty answer selects the default.
    local function prompt_model_name()
        io.write("Embedding model (default: EmbeddingGemma:latest): ")
        local model_input = read_line()
        return model_input ~= "" and model_input or default_model
    end

    -- Ask a sparse/full question; any answer starting with "f"/"F" means full.
    local function prompt_use_full_matrix(prompt)
        io.write(prompt)
        return read_line():lower():sub(1, 1) == "f"
    end

    if not interactive_mode then
        -- Default: run similarity analysis on existing data
        utils.log_info("Running similarity engine analysis...")
        local similarity_file = utils.asset_path(similarity_matrix_name)
        local poems_file = utils.asset_path("poems.json")
        local report_file = utils.asset_path("similarity-report.json")

        if utils.file_exists(similarity_file) then
            M.generate_similarity_report(similarity_file, poems_file, report_file)
        else
            utils.log_info("No similarity matrix found. Use interactive mode (-I) to generate embeddings and similarities.")
        end
        return
    end

    utils.log_info("=== Similarity Engine Interactive Mode ===")
    print("1. Generate embeddings for all poems")
    print("2. Calculate similarity matrix (sparse, top-N)")
    print("3. Calculate FULL similarity matrix (all pairs)")
    print("4. Generate similarity analysis report")
    print("5. Run complete pipeline")
    print("6. Generate matrices for ALL eligible models")
    print("7. Show multi-model status")
    print("8. Compare model similarities")
    io.write("Select option (1-8): ")
    local choice = read_line()

    if choice == "1" then
        local poems_file = utils.asset_path("poems.json")
        local base_output_dir = utils.get_assets_root()
        io.write("Use incremental processing? (Y/n): ")
        local incremental_choice = read_line():lower()
        -- Anything other than an explicit "n"/"no" keeps incremental mode on.
        local incremental = not (incremental_choice == "n" or incremental_choice == "no")
        local model_name = prompt_model_name()
        M.generate_all_embeddings(poems_file, base_output_dir, nil, incremental, model_name)

    elseif choice == "2" then
        local model_name = prompt_model_name()
        local base_output_dir = utils.get_assets_root()
        local storage_paths = get_model_storage_path(base_output_dir, model_name)
        M.calculate_similarity_matrix(storage_paths.embeddings, storage_paths.similarity_matrix)

    elseif choice == "3" then
        local model_name = prompt_model_name()
        local base_output_dir = utils.get_assets_root()
        local storage_paths = get_model_storage_path(base_output_dir, model_name)
        local embeddings_file = storage_paths.embeddings
        local output_file = storage_paths.similarity_matrix:gsub("%.json$", "_full.json")

        utils.log_info("⚠️ FULL matrix generation will take 2-4 hours and create ~100MB file")
        io.write("Continue? (y/N): ")
        local confirm = read_line():lower()
        if confirm == "y" or confirm == "yes" then
            M.calculate_full_similarity_matrix(embeddings_file, output_file, false)
        else
            utils.log_info("Full matrix generation cancelled")
        end

    elseif choice == "4" then
        local similarity_file = utils.asset_path(similarity_matrix_name)
        local poems_file = utils.asset_path("poems.json")
        local output_file = utils.asset_path("similarity-report.json")
        M.generate_similarity_report(similarity_file, poems_file, output_file)

    elseif choice == "5" then
        utils.log_info("Running complete similarity engine pipeline...")
        local poems_file = utils.asset_path("poems.json")
        local base_output_dir = utils.get_assets_root()
        -- The pipeline writes its matrix to the shared asset path (not the
        -- per-model storage path) and then builds the report from that file.
        local similarity_file = utils.asset_path(similarity_matrix_name)
        local report_file = utils.asset_path("similarity-report.json")

        if M.generate_all_embeddings(poems_file, base_output_dir) then
            local storage_paths = get_model_storage_path(base_output_dir, default_model)
            local embeddings_file = storage_paths.embeddings
            if M.calculate_similarity_matrix(embeddings_file, similarity_file) then
                M.generate_similarity_report(similarity_file, poems_file, report_file)
                utils.log_info("✅ Complete pipeline executed successfully!")
            else
                utils.log_error("Pipeline failed at similarity matrix calculation")
            end
        else
            utils.log_error("Pipeline failed at embedding generation")
        end

    elseif choice == "6" then
        local base_output_dir = utils.get_assets_root()
        local use_full_matrix = prompt_use_full_matrix("Matrix type - (s)parse or (f)ull? (default: sparse): ")

        io.write("Minimum completeness percentage (default: 80): ")
        -- Convert the percentage to a 0..1 fraction; non-numeric input means 80.
        local min_completeness = (tonumber(read_line()) or 80) / 100

        M.generate_all_model_similarity_matrices(base_output_dir, min_completeness, use_full_matrix)
        utils.log_info("Multi-model generation complete. Results available in similarity engine.")

    elseif choice == "7" then
        local base_output_dir = utils.get_assets_root()
        M.get_multi_model_status(base_output_dir)

    elseif choice == "8" then
        io.write("Poem ID to compare: ")
        local poem_id = tonumber(read_line())
        if not poem_id then
            -- Without this guard a nil ID would be forwarded to the comparison.
            print("Invalid poem ID")
            return
        end
        local base_output_dir = utils.get_assets_root()
        local use_full_matrix = prompt_use_full_matrix("Use (s)parse or (f)ull matrices? (default: sparse): ")

        M.compare_model_similarities(poem_id, base_output_dir, {}, use_full_matrix)
        utils.log_info("Model comparison complete.")

    else
        print("Invalid choice")
    end
end

1793-- }}}

1795-- {{{ function M.flush_embeddings_cache

-- Flush an embeddings cache file, optionally backing it up first.
--
-- Parameters:
--   output_file  path of the cache file to flush
--   flush_type   "all" (default) deletes the file; "errors" rewrites it with
--                only the entries whose embedding vector is valid
--   backup       pass false to skip the backup; any other value (including
--                nil) copies the file to "<output_file>.backup.<timestamp>"
--                before touching it
--
-- Returns true on success or when there is nothing to do (file missing, or no
-- embeddings table in the file); false on an unknown flush type or any
-- backup / remove / write failure. The cache file is never modified unless
-- the requested backup was written and closed successfully.
function M.flush_embeddings_cache(output_file, flush_type, backup)
    flush_type = flush_type or "all" -- "all" or "errors"
    backup = backup ~= false -- Default to true

    if not utils.file_exists(output_file) then
        utils.log_info("No cache file found at: " .. output_file)
        return true
    end

    -- Reject unsupported types before doing any work, so a typo does not
    -- cost a full backup copy of the cache.
    if flush_type ~= "all" and flush_type ~= "errors" then
        utils.log_error("Unknown flush type: " .. tostring(flush_type))
        return false
    end

    -- Human-readable file size, computed with Lua file operations instead of
    -- shelling out to `du` (no shell quoting issues, no leaked popen handle).
    local function describe_size(path)
        local handle = io.open(path, "rb")
        if not handle then
            return "unknown"
        end
        local bytes = handle:seek("end")
        handle:close()
        if not bytes then
            return "unknown"
        end

        local units = { "B", "K", "M", "G", "T" }
        local unit_index = 1
        while bytes >= 1024 and unit_index < #units do
            bytes = bytes / 1024
            unit_index = unit_index + 1
        end
        if unit_index == 1 then
            return string.format("%d%s", bytes, units[unit_index])
        end
        return string.format("%.1f%s", bytes, units[unit_index])
    end

    utils.log_info("Cache flush operation: " .. flush_type)
    utils.log_info("Target file: " .. output_file)
    utils.log_info("File size: " .. describe_size(output_file))

    if backup then
        local backup_file = output_file .. ".backup." .. os.date("%Y%m%d_%H%M%S")

        -- Use Lua file operations for better cross-platform compatibility
        local source_file = io.open(output_file, "rb")
        if not source_file then
            utils.log_error("Failed to open source file for backup")
            return false
        end

        local content = source_file:read("*a")
        source_file:close()
        if not content then
            utils.log_error("Failed to open source file for backup")
            return false
        end

        local backup_dest = io.open(backup_file, "wb")
        if not backup_dest then
            utils.log_error("Failed to create backup file: " .. backup_file)
            return false
        end

        -- The original is about to be deleted or rewritten, so a backup that
        -- was not fully written (e.g. disk full) must abort the flush.
        local wrote = backup_dest:write(content)
        local closed = backup_dest:close()
        if not wrote or not closed then
            os.remove(backup_file) -- do not leave a truncated "backup" behind
            utils.log_error("Failed to write backup file: " .. backup_file)
            return false
        end

        utils.log_info("Backup created: " .. backup_file)
    end

    if flush_type == "all" then
        -- Complete cache flush
        if not os.remove(output_file) then
            utils.log_error("Failed to remove cache file")
            return false
        end
        utils.log_info("✅ Complete embedding cache flushed")
        return true
    end

    -- flush_type == "errors": drop only error entries, keep valid embeddings
    local existing_data = utils.read_json_file(output_file)
    if type(existing_data) ~= "table" or type(existing_data.embeddings) ~= "table" then
        utils.log_warn("No embeddings data found in cache file")
        return true
    end

    local clean_embeddings = {}
    local removed_count = 0
    local kept_count = 0

    -- "Valid" means "matches the cache file's declared dimension." That
    -- declared dimension is read from the metadata block of this same file,
    -- so the check is model-agnostic. When no dimension is declared, any
    -- non-empty vector counts as valid.
    local expected_dim = existing_data.metadata and existing_data.metadata.embedding_dimension

    -- NOTE(review): entries keep their original keys. If `embeddings` is a
    -- JSON array, removing entries leaves holes in the sequence -- confirm
    -- that utils.write_json_file encodes such a table the way readers expect.
    for key, entry in pairs(existing_data.embeddings) do
        -- Non-table entries (e.g. a decoded JSON null) are treated as errors
        -- instead of raising on the field access.
        local vector = type(entry) == "table" and entry.embedding or nil
        local dim_ok = type(vector) == "table"
            and #vector > 0
            and (not expected_dim or #vector == expected_dim)
        if dim_ok then
            clean_embeddings[key] = entry
            kept_count = kept_count + 1
        else
            removed_count = removed_count + 1
        end
    end

    existing_data.embeddings = clean_embeddings

    -- Update metadata
    if existing_data.metadata then
        existing_data.metadata.completed_embeddings = kept_count
        existing_data.metadata.last_flush_operation = {
            type = "errors_only",
            timestamp = os.date("%Y-%m-%d %H:%M:%S"),
            removed_entries = removed_count,
            kept_entries = kept_count
        }
    end

    if not utils.write_json_file(output_file, existing_data) then
        utils.log_error("Failed to write cleaned cache file")
        return false
    end

    utils.log_info("✅ Error entries flushed: " .. removed_count .. " entries removed, " .. kept_count .. " kept")
    return true
end

1907-- }}}