src/similarity-engine.lua
1#!/usr/bin/env lua
2
-- {{{ local function setup_dir_path
--- Resolve the project root directory used for library and tmp paths.
-- @param provided_dir optional override; returned unchanged when truthy
-- @return provided_dir when given, otherwise the hard-coded checkout path
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
-- }}}
11
12-- Script configuration
13local DIR = setup_dir_path()
14
15-- Load required libraries
16package.path = DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
17local utils = require("utils")
18local dkjson = require("dkjson")
19local inference_config = require("inference-server-config")
20local poem_extractor = require("poem-extractor")
21-- Issue 10-050: batched + chunked embedding generation. We pass this module our
22-- OWN endpoint and prompt-formatter (inference_config above) so the batch path
23-- shares this file's server selection instead of fuzzy-computing's separate
24-- inference-server-config instance.
25local fuzzy = require("fuzzy-computing")
26
27-- Initialize asset path configuration for standalone execution
28utils.init_assets_root(arg)
29
30local M = {}
31
-- {{{ Model configurations
-- Registry of the embedding models this script knows about, keyed by the
-- model name callers pass in. Per entry:
--   dimensions              expected length of each embedding vector
--   timeout                 per-call time allowance (presumably seconds)
--   requires_prompt_prefix  set when the model needs a task prefix on its input
local embedding_models = {
    -- Keyed by the GGUF-basename form ("nomic-embed-text-v1.5") because that is
    -- what config.lua, run.sh, and generate-embeddings.sh actually pass. The
    -- Ollama-era "model:tag" colon form ("nomic-embed-text:v1.5") never
    -- resolved after the 10-049 migration, so a full regen aborted with
    -- "Unknown embedding model" before sending any request. (Leftover from
    -- 10-049.)
    -- v1.5 routes through task-specific weights based on the prompt prefix;
    -- the active prefix is configured per inference_servers entry.
    ["nomic-embed-text-v1.5"] = { dimensions = 768, timeout = 30, requires_prompt_prefix = true },

    ["embeddinggemma:latest"] = { dimensions = 768, timeout = 30 },

    -- Issue 10-031: GGUF-basename forms for the local model-comparison set.
    ["mxbai-embed-large-v1"] = { dimensions = 1024, timeout = 30 },

    -- Uses a clustering task prompt, configured per inference_servers entry.
    ["embeddinggemma-300m"] = { dimensions = 768, timeout = 30, requires_prompt_prefix = true },

    -- Bigger model, longer per-call allowance.
    ["qwen3-embedding:4b"] = { dimensions = 2560, timeout = 60 },

    ["qwen3-embedding:8b"] = { dimensions = 4096, timeout = 90 },

    ["text-embedding-ada-002"] = { dimensions = 1536, timeout = 60 },

    ["all-MiniLM-L6-v2"] = { dimensions = 384, timeout = 20 }
}
-- }}}
79
-- {{{ local function get_model_storage_path
--- Resolve (and create) a model's cache directory and return its file paths.
--
-- Issue 10-054: the model's cache dir comes from utils.embeddings_dir() so it
-- follows the RAM/disk switch -- this function is the EMBEDDING GENERATOR's
-- write path (similarity-engine.lua is the embedder behind generate-
-- embeddings.sh, not just legacy matrix code), so leaving it on disk is what
-- made the flip write embeddings where no reader looked.
--
-- @param base_dir   ignored; kept in the signature for callers that still
--                   pass get_assets_root
-- @param model_name model whose cache directory is wanted
-- @return table with `embeddings`, `similarity_matrix` and `metadata` paths
local function get_model_storage_path(base_dir, model_name)
    local model_dir = utils.embeddings_dir(model_name)

    -- Single-quote the path for the shell (escaping embedded quotes) so a
    -- root containing spaces or metacharacters cannot split the argument or
    -- inject extra commands. `--` stops option parsing for odd leading dashes.
    local escaped_dir = model_dir:gsub("'", "'\\''")
    local ok = os.execute("mkdir -p -- '" .. escaped_dir .. "'")

    -- os.execute reports success as 0 on Lua 5.1 and as true on 5.2+.
    -- Directory creation stays best-effort, but a failure is no longer silent.
    if ok ~= true and ok ~= 0 then
        utils.log_warn("Could not create embeddings directory: " .. model_dir)
    end

    return {
        embeddings = model_dir .. "/embeddings.json",
        similarity_matrix = model_dir .. "/similarity_matrix.json",
        metadata = model_dir .. "/metadata.json"
    }
end
-- }}}
102
-- {{{ local function cosine_similarity
--- Cosine of the angle between two numeric vectors.
-- @param vec1 array of numbers
-- @param vec2 array of numbers, same length as vec1
-- @return dot(vec1, vec2) / (|vec1| * |vec2|), or 0 when either magnitude is 0
-- Raises an error when the two lengths differ.
local function cosine_similarity(vec1, vec2)
    local n = #vec1
    if n ~= #vec2 then
        error("Vectors must have same dimension")
    end

    -- One pass accumulates the dot product and both squared magnitudes.
    local dot, sum_sq1, sum_sq2 = 0, 0, 0
    for i = 1, n do
        local a, b = vec1[i], vec2[i]
        dot = dot + (a * b)
        sum_sq1 = sum_sq1 + (a * a)
        sum_sq2 = sum_sq2 + (b * b)
    end

    local magnitude1 = math.sqrt(sum_sq1)
    local magnitude2 = math.sqrt(sum_sq2)

    -- A zero vector has no direction; report "no similarity" instead of
    -- dividing by zero.
    if magnitude1 == 0 or magnitude2 == 0 then
        return 0
    end

    return dot / (magnitude1 * magnitude2)
end
-- }}}
129
-- {{{ local function generate_embedding
--- Request one embedding vector for `text` from the inference server.
--
-- model_name is required; it ends up in the request payload AND determines
-- which dimension downstream validators expect. Defaults are dangerous here
-- because the wrong model silently produces wrong-shape embeddings.
--
-- @param text       text to embed; passed through
--                   inference_config.format_embedding_prompt first
-- @param endpoint   server base URL; "/v1/embeddings" is appended
-- @param model_name model identifier placed in the request payload
-- @return embedding table, "success" on success
-- @return nil, code on failure, where code is one of "file_error",
--         "network_error", "empty_response", "connection_error",
--         "invalid_dimensions", "parse_error"
local function generate_embedding(text, endpoint, model_name)
    -- Create a temporary file to avoid shell escaping issues.
    -- Issue 8-059: route through the project's tmpfs-backed tmp/ symlink so
    -- parallel checkouts of this repository do not collide on a single shared
    -- /tmp/ filename.
    os.execute(string.format('"%s/scripts/ensure-tmp-symlink" "%s"', DIR, DIR))
    local temp_file = DIR .. "/tmp/embedding_input.json"
    local payload = {
        model = model_name,
        -- Apply the active server's task-prefix (e.g. "clustering: " for
        -- nomic-embed-text v1.5+). No-op for models that don't need one.
        input = inference_config.format_embedding_prompt(text)
    }

    local f = io.open(temp_file, "w")
    if not f then
        utils.log_error("Failed to create temporary file")
        return nil, "file_error"
    end
    f:write(dkjson.encode(payload))
    f:close()

    -- Honour the per-model `timeout` from embedding_models so the larger
    -- models (60/90) are not cut off at a fixed 30; unknown models keep 30.
    local model_cfg = embedding_models[model_name]
    local max_time = (model_cfg and model_cfg.timeout) or 30

    -- 10-049: /v1/embeddings (OpenAI shape) replaces Ollama's /api/embed.
    -- llama.cpp exposes a single endpoint regardless of which model is
    -- loaded, so the endpoint path is the same for every model in the
    -- embedding_models table (the per-model endpoint_path field was removed
    -- in the same migration). The temp file path is quoted so a project
    -- directory containing spaces does not split the argument.
    local cmd = string.format(
        'curl -s --connect-timeout 10 --max-time %d "%s/v1/embeddings" -H "Content-Type: application/json" -d @"%s"',
        max_time, endpoint, temp_file
    )

    local handle = io.popen(cmd)
    if not handle then
        -- popen itself failed (e.g. no shell / fork failure): clean up and
        -- report it like any other transport failure instead of crashing.
        os.remove(temp_file)
        utils.log_error("Network error: could not start curl")
        return nil, "network_error"
    end
    local result = handle:read("*a")
    local success, _, exit_code = handle:close()

    -- Clean up temp file
    os.remove(temp_file)

    -- Check for network/connection errors. On Lua 5.1 / LuaJIT, close() on a
    -- popen handle returns only `true`, so a missing exit code must not be
    -- read as a failure; 5.2+ returns (ok, "exit"|"signal", code).
    if not success or (exit_code ~= nil and exit_code ~= 0) then
        utils.log_error("Network error: curl failed with exit code " .. tostring(exit_code or "unknown"))
        return nil, "network_error"
    end

    -- Check for empty or invalid response
    if not result or result:match("^%s*$") then
        utils.log_error("Empty response from API endpoint")
        return nil, "empty_response"
    end

    -- Check for curl error messages
    if result:match("curl:") or result:match("Could not resolve host") or result:match("Connection refused") then
        -- Parentheses drop gsub's second return value (the match count).
        utils.log_error("Connection error: " .. (result:gsub("\n", " ")))
        return nil, "connection_error"
    end

    local parsed = dkjson.decode(result)
    -- 10-049: OpenAI shape — vectors live under data[N].embedding rather
    -- than directly under .embeddings[N]. We send one input per call here,
    -- so we read data[1].embedding.
    if not (parsed and parsed.data and parsed.data[1] and parsed.data[1].embedding) then
        utils.log_error("Failed to parse API response: " .. result:sub(1, 200))
        return nil, "parse_error"
    end

    -- Accept any positive-dimension embedding. The hardcoded "== 768" that
    -- used to live here would have rejected every output from
    -- qwen3-embedding (2560-D) or any other non-gemma model, so there is
    -- nothing to gain from gating on a fixed value here.
    local embedding = parsed.data[1].embedding
    if type(embedding) == "table" and #embedding > 0 then
        return embedding, "success"
    end

    utils.log_error("Invalid embedding response: " .. (type(embedding) == "table" and "empty table" or type(embedding)))
    return nil, "invalid_dimensions"
end
-- }}}
215
-- {{{ local function table_length
--- Count every key in a table (array and hash entries alike).
-- Unlike `#t`, this also works for sparse or string-keyed tables.
-- @param t table to count
-- @return number of key/value pairs visited by pairs(t)
local function table_length(t)
    local entries = 0
    for _key, _value in pairs(t) do
        entries = entries + 1
    end
    return entries
end
-- }}}
225
-- {{{ local function generate_random_embedding
--- Build a pseudo-random unit vector to stand in for a poem with no text.
-- The global math.random generator is reseeded with poem_id on every call,
-- so the same numeric id yields the same vector. Ids that are not numbers
-- all share the fixed seed 12345.
-- @param poem_id   seed for the generator (used only when it is a number)
-- @param dimension vector length; defaults to 768
-- @return array of `dimension` numbers scaled to unit length
local function generate_random_embedding(poem_id, dimension)
    local size = dimension or 768

    -- Seed with poem_id for reproducibility
    math.randomseed(type(poem_id) == "number" and poem_id or 12345)

    -- Draw each component uniformly from [-1, 1) and track the squared length.
    local vector, sum_of_squares = {}, 0
    for i = 1, size do
        local component = math.random() * 2 - 1
        vector[i] = component
        sum_of_squares = sum_of_squares + component * component
    end

    -- Scale to a unit vector for consistent similarity calculations.
    local magnitude = math.sqrt(sum_of_squares)
    if magnitude > 0 then
        for i = 1, size do
            vector[i] = vector[i] / magnitude
        end
    end

    return vector
end
-- }}}
256
-- {{{ local function inherit_embedding
--- Issue 9-010: derive an embedding for an image-only post from its nearest
-- text poem, optionally blended with the post's own text embedding.
--
-- @param nearest_embedding vector of the nearest text poem; nil means there
--                          is nothing to inherit
-- @param own_embedding     optional vector for the post's own text
-- @param dimension         number of components to produce; defaults to the
--                          length of nearest_embedding
-- @return nil when nearest_embedding is missing; otherwise a new table: a
--         copy of nearest_embedding (pure inheritance), or the average of
--         both vectors scaled to unit length
local function inherit_embedding(nearest_embedding, own_embedding, dimension)
    if not nearest_embedding then
        return nil -- No embedding to inherit
    end

    -- Read the width off the inherited vector instead of assuming 768, so
    -- callers that omit `dimension` still work for models of other widths.
    dimension = dimension or #nearest_embedding

    -- An own embedding of the wrong width cannot be averaged component-wise:
    -- a shorter one would fail on nil arithmetic and a longer one would be
    -- silently truncated. Fall back to pure inheritance in that case.
    if own_embedding and #own_embedding ~= dimension then
        utils.log_warn("inherit_embedding: own embedding has " .. #own_embedding ..
            " dims, expected " .. dimension .. " - using pure inheritance")
        own_embedding = nil
    end

    local result = {}

    if not own_embedding then
        -- Pure inheritance: copy so the caller never aliases the source.
        for i = 1, dimension do
            result[i] = nearest_embedding[i]
        end
        return result
    end

    -- Combine embeddings: average of nearest and own. This gives semantic
    -- meaning from context while preserving any content the post has.
    local norm = 0
    for i = 1, dimension do
        local mixed = (nearest_embedding[i] + own_embedding[i]) / 2
        result[i] = mixed
        norm = norm + mixed * mixed
    end

    -- Normalize to unit vector for consistent similarity calculations.
    -- A zero-length average (exactly opposite inputs) is returned as-is.
    norm = math.sqrt(norm)
    if norm > 0 then
        for i = 1, dimension do
            result[i] = result[i] / norm
        end
    end

    return result
end
-- }}}
297
-- {{{ local network_error_config
-- Thresholds and retry pacing applied when embedding requests hit network
-- errors.
local network_error_config = {
    -- Max consecutive network errors before abort
    max_consecutive_errors = 5,
    -- Max total network errors in session
    max_total_errors = 20,
    -- Initial delay in seconds
    initial_retry_delay = 2,
    -- Maximum delay in seconds
    max_retry_delay = 60,
    -- Exponential backoff multiplier
    backoff_multiplier = 2
}
-- }}}
307
-- {{{ function migrate_legacy_cache
--- Move a legacy (pre model-specific) embeddings cache into a model's
-- cache directory.
--
-- The legacy file is first renamed to "<legacy_file>.legacy_backup", then its
-- JSON content is written to "<target_model_dir>/embeddings.json". Each step
-- is checked, so success is only logged when the data was actually written;
-- on any failure after the rename the backup file is left in place.
--
-- NOTE: this function is global (declared without `local`). It is kept
-- global so anything that already resolves it by name continues to work.
--
-- @param legacy_file      path of the legacy embeddings JSON file
-- @param target_model_dir directory that receives embeddings.json
-- @return true when the legacy data was written to the target; false when
--         there was nothing to migrate or a step failed
function migrate_legacy_cache(legacy_file, target_model_dir)
    if not utils.file_exists(legacy_file) then
        return false
    end

    utils.log_info("Migrating legacy cache to model-specific storage...")

    -- os.rename returns nil plus a message on failure; without the rename the
    -- backup does not exist, so stop here rather than fail quietly below.
    local backup_file = legacy_file .. ".legacy_backup"
    local renamed, rename_err = os.rename(legacy_file, backup_file)
    if not renamed then
        utils.log_error("Legacy cache migration failed: cannot rename " .. legacy_file ..
            " (" .. tostring(rename_err) .. ")")
        return false
    end

    local legacy_data = utils.read_json_file(backup_file)
    if not legacy_data then
        utils.log_error("Legacy cache migration failed: cannot read " .. backup_file)
        return false
    end

    local target_file = target_model_dir .. "/embeddings.json"
    if not utils.write_json_file(target_file, legacy_data) then
        utils.log_error("Legacy cache migration failed: cannot write " .. target_file ..
            " (legacy data kept at " .. backup_file .. ")")
        return false
    end

    utils.log_info("Legacy cache migrated successfully")
    return true
end
-- }}}
324
-- {{{ function M.list_available_models
--- Log every registered embedding model with its vector width.
-- Names are sorted first: pairs() visits keys in an unspecified order, which
-- would otherwise make the listing order vary.
-- @return the embedding_models registry table itself
function M.list_available_models()
    utils.log_info("Available Embedding Models:")

    local names = {}
    for model_name in pairs(embedding_models) do
        names[#names + 1] = model_name
    end
    table.sort(names)

    for _, model_name in ipairs(names) do
        local config = embedding_models[model_name]
        utils.log_info(" " .. model_name .. " (" .. config.dimensions .. " dims)")
    end
    return embedding_models
end
-- }}}
334
-- {{{ function M.get_model_status
--- Report whether a model has a cached embeddings file and how many entries
-- it holds.
-- @param base_output_dir forwarded to get_model_storage_path
-- @param model_name      optional; defaults to the configured/overridden
--                        model from inference_config.get_selected_model(),
--                        not a hardcoded literal, so a model swap in
--                        config.lua (or a --model on the CLI) is reflected
-- @return table { exists, count, location } plus `metadata` when the cache
--         file could be read and contains an `embeddings` field
function M.get_model_status(base_output_dir, model_name)
    local resolved_model = model_name or inference_config.get_selected_model()
    local cache_file = get_model_storage_path(base_output_dir, resolved_model).embeddings

    -- Only read the file when it exists; a missing or unreadable cache and a
    -- cache without an `embeddings` field are all reported as "no cache".
    local cached = utils.file_exists(cache_file) and utils.read_json_file(cache_file)
    if not (cached and cached.embeddings) then
        return {
            exists = false,
            count = 0,
            location = cache_file
        }
    end

    return {
        exists = true,
        count = table_length(cached.embeddings),
        location = cache_file,
        metadata = cached.metadata
    }
end
-- }}}
365
-- {{{ function M.show_all_model_status
--- Log the cache status of every registered embedding model.
-- Models are listed in sorted name order because pairs() visits keys in an
-- unspecified order. For each model the line shows its vector width and
-- either the cached-embedding count with completion percentage or
-- "No cache found".
-- @param base_output_dir forwarded to M.get_model_status
function M.show_all_model_status(base_output_dir)
    utils.log_info("Available Embedding Models:")

    local names = {}
    for model_name in pairs(embedding_models) do
        names[#names + 1] = model_name
    end
    table.sort(names)

    for _, model_name in ipairs(names) do
        local config = embedding_models[model_name]
        local status = M.get_model_status(base_output_dir, model_name)
        local label = " " .. model_name .. " (" .. config.dimensions .. " dims) - "

        if status.exists then
            -- Missing or non-numeric completion_rate is shown as 0.0% rather
            -- than raising inside the arithmetic / string.format below.
            local completion_rate = tonumber(status.metadata and status.metadata.completion_rate) or 0
            utils.log_info(label ..
                status.count .. " cached embeddings (" ..
                string.format("%.1f%%", completion_rate * 100) .. ")")
        else
            utils.log_info(label .. "No cache found")
        end
    end
end
-- }}}
382
383-- {{{ function M.generate_all_embeddings
384function M.generate_all_embeddings(poems_file, base_output_dir, endpoint, incremental, model_name)
385 -- Issue 10-017: Use build_host_url() instead of deprecated OLLAMA_ENDPOINT
386 endpoint = endpoint or inference_config.build_host_url()
387 incremental = incremental ~= false -- Default to true
388 -- Default to the configured/overridden model, not a hardcoded literal (the
389 -- caller, generate-embeddings.sh, always passes one; this guards direct use).
390 model_name = model_name or inference_config.get_selected_model()
391
392 -- Get model-specific configuration
393 local model_config = embedding_models[model_name]
394 if not model_config then
395 utils.log_error("Unknown embedding model: " .. model_name)
396 return false
397 end
398
399 -- Generate model-specific file paths
400 local storage_paths = get_model_storage_path(base_output_dir, model_name)
401 local output_file = storage_paths.embeddings
402
403 utils.log_info("Using embedding model: " .. model_name)
404 utils.log_info("Storage location: " .. output_file)
405 utils.log_info("Expected dimensions: " .. model_config.dimensions)
406
407 -- Handle legacy cache migration
408 local legacy_cache = base_output_dir .. "/embeddings.json"
409 if utils.file_exists(legacy_cache) and output_file ~= legacy_cache then
410 migrate_legacy_cache(legacy_cache, base_output_dir .. "/embeddings/" .. model_name:gsub("[^%w%-_.]", "_"))
411 end
412
413 utils.log_info("Loading poems from: " .. poems_file)
414 local poems_data = utils.read_json_file(poems_file)
415 if not poems_data or not poems_data.poems then
416 utils.log_error("Failed to load poems from " .. poems_file)
417 return false
418 end
419 local poems = poems_data.poems
420
421 -- Load existing embeddings if incremental mode enabled
422 local existing_embeddings = {}
423 -- Pull the dimension from the model registry. If the model is unknown,
424 -- leave dim at nil here; it will be populated from the first embedding
425 -- we actually receive below, so the metadata reflects ground truth.
426 local model_dim = embedding_models[model_name] and embedding_models[model_name].dimensions or nil
427 local embeddings_data = {
428 metadata = {
429 total_poems = #poems,
430 embedding_model = model_name,
431 embedding_dimension = model_dim,
432 generated_at = os.date("%Y-%m-%d %H:%M:%S"),
433 endpoint = endpoint,
434 incremental_update = incremental
435 },
436 embeddings = {}
437 }
438
439 if incremental and utils.file_exists(output_file) then
440 utils.log_info("Incremental mode: Loading existing embeddings...")
441 local existing_data = utils.read_json_file(output_file)
442 if existing_data and existing_data.embeddings then
443 -- Handle both array and object formats for existing embeddings
444 -- Key insight (Issue 8-019): We store by poem_index, not by id, because
445 -- the same id can exist in multiple categories (e.g., fediverse/0002.txt
446 -- and messages/0002.txt both have id=2 but different poem_index values).
447 if type(existing_data.embeddings) == "table" then
448 if existing_data.embeddings[1] then
449 -- Array format (legacy format before poem_index)
450 -- Use poem_index from embedding if available, else use array position
451 for i, emb in ipairs(existing_data.embeddings) do
452 local key = emb.poem_index or i
453 existing_embeddings[key] = emb
454 end
455 else
456 -- Object format (current format) - key-value pairs by poem_index
457 for poem_index, emb in pairs(existing_data.embeddings) do
458 -- Store by poem_index for correct lookup
459 existing_embeddings[tonumber(poem_index)] = emb
460 end
461 end
462 end
463
464 -- Preserve existing metadata
465 if existing_data.metadata then
466 embeddings_data.metadata.original_generated_at = existing_data.metadata.generated_at
467 embeddings_data.metadata.previous_total = existing_data.metadata.total_poems
468 end
469
470 utils.log_info("Found " .. table_length(existing_embeddings) .. " existing embeddings")
471 end
472 end
473
474 -- Count poems that need processing
475 local poems_to_process = {}
476 local skipped_count = 0
477 local retry_count = 0
478 local retry_reasons = {}
479
480 if incremental then
481 -- Incremental mode: Check existing embeddings and only process missing/invalid ones
482 for i, poem in ipairs(poems) do
483 -- Use poem_index if available, fallback to array index for legacy poems.json
484 -- This ensures correct matching even when the same id appears in multiple categories.
485 local lookup_key = poem.poem_index or i
486
487 -- Only skip if embedding is valid AND dimensions are correct
488 if existing_embeddings[lookup_key] and
489 existing_embeddings[lookup_key].embedding and
490 type(existing_embeddings[lookup_key].embedding) == "table" and
491 #existing_embeddings[lookup_key].embedding == model_config.dimensions then
492 -- Skip: valid embedding found
493 embeddings_data.embeddings[lookup_key] = existing_embeddings[lookup_key]
494 skipped_count = skipped_count + 1
495 else
496 -- Re-process: no embedding, invalid embedding, or error state
497 table.insert(poems_to_process, {index = lookup_key, poem = poem})
498
499 -- Track retry reasons for reporting
500 if existing_embeddings[lookup_key] then
501 if existing_embeddings[lookup_key].error then
502 retry_count = retry_count + 1
503 local error_type = existing_embeddings[lookup_key].error
504 retry_reasons[error_type] = (retry_reasons[error_type] or 0) + 1
505 elseif existing_embeddings[lookup_key].embedding then
506 -- Invalid embedding dimensions
507 retry_count = retry_count + 1
508 retry_reasons["invalid_dimensions"] = (retry_reasons["invalid_dimensions"] or 0) + 1
509 end
510 end
511 end
512 end
513 end
514
515 if incremental then
516 utils.log_info("Incremental processing summary:")
517 utils.log_info(" Total poems: " .. #poems)
518 utils.log_info(" Valid existing embeddings: " .. skipped_count)
519
520 -- Enhanced retry reporting
521 if retry_count > 0 then
522 local retry_details = {}
523 for error_type, count in pairs(retry_reasons) do
524 table.insert(retry_details, error_type .. ": " .. count)
525 end
526 utils.log_info(" Error entries to retry: " .. retry_count .. " (" .. table.concat(retry_details, ", ") .. ")")
527 end
528
529 local new_poems = #poems_to_process - retry_count
530 if new_poems > 0 then
531 utils.log_info(" New poems to process: " .. new_poems)
532 end
533
534 utils.log_info(" Processing queue: " .. #poems_to_process .. " poems" ..
535 (retry_count > 0 and (" (" .. new_poems .. " new + " .. retry_count .. " retries)") or ""))
536 utils.log_info(" Processing savings: " .. string.format("%.1f%%", (skipped_count / #poems) * 100))
537
538 if #poems_to_process == 0 then
539 utils.log_info("✅ All embeddings already exist and are valid!")
540 embeddings_data.metadata.completed_embeddings = skipped_count
541 embeddings_data.metadata.completion_rate = 1.0
542 embeddings_data.metadata.processing_mode = "no_update_needed"
543 return utils.write_json_file(output_file, embeddings_data)
544 end
545 else
546 utils.log_info("Full regeneration mode: Processing all " .. #poems .. " poems...")
547 for i, poem in ipairs(poems) do
548 table.insert(poems_to_process, {index = i, poem = poem})
549 end
550 end
551
552 -- Issue 10-050: poems are embedded a WINDOW at a time. All normal text poems
553 -- in a window go out as ONE batched + chunked embedding call (was: one HTTP
554 -- request per poem). Window size is the batch primitive's BATCH_SIZE.
555 local window = fuzzy.BATCH_SIZE
556 if window < 1 then window = 1 end
557 -- Issue 8-021 Fix: Track newly processed poems separately to prevent overcounting.
558 -- The bug occurred when key lookups failed due to poem_index format mismatches,
559 -- causing poems to be added to poems_to_process even though they had valid embeddings
560 -- under different keys. This led to completed = skipped_count + #poems_to_process > #poems.
561 local newly_processed = 0 -- Track only newly processed poems
562 local total_poems = #poems -- Cache for sanity checks
563
564 -- Sanity check: detect potential key mismatch (Issue 8-021)
565 -- If skipped_count + #poems_to_process > #poems, there's likely a key lookup issue
566 if skipped_count + #poems_to_process > total_poems then
567 utils.log_warn("⚠️ Potential key mismatch detected:")
568 utils.log_warn(" skipped_count (" .. skipped_count .. ") + poems_to_process (" .. #poems_to_process .. ") = " .. (skipped_count + #poems_to_process))
569 utils.log_warn(" This exceeds total poems (" .. total_poems .. ")")
570 utils.log_warn(" Some embeddings may be stored under legacy keys.")
571 utils.log_warn(" Continuing with processing - data will be correct, only counter may be affected.")
572 end
573
574 -- Network error tracking
575 local consecutive_errors = 0
576 local total_errors = 0
577 local current_delay = network_error_config.initial_retry_delay
578
579 -- Write initial progress state (just counts, no timing)
580 local user = os.getenv("USER") or "ritz" -- fallback to ritz
581 -- Issue 8-059: shared with scripts/generate-embeddings.sh which reads
582 -- this file; both sides now agree on the project-local tmpfs path.
583 local progress_file = DIR .. "/tmp/embedding_progress_" .. user .. ".txt"
584 -- Issue 8-021 Fix: Use safe_completed to cap progress at total_poems
585 local safe_completed = math.min(skipped_count + newly_processed, total_poems)
586 local initial_progress = string.format("%d,%d", safe_completed, total_poems)
587 local pf = io.open(progress_file, "w")
588 if pf then
589 pf:write(initial_progress)
590 pf:close()
591 end
592
593 -- {{{ Issue 10-050 helpers (closures over the loop's running state)
594 -- write_progress: the count-only progress file generate-embeddings.sh tails.
595 -- Issue 8-059: project-local tmpfs path, shared with the bash monitor.
596 -- Issue 8-021: cap at total_poems so a key mismatch can't overcount.
597 local function write_progress()
598 local user = os.getenv("USER") or "ritz"
599 local progress_file = DIR .. "/tmp/embedding_progress_" .. user .. ".txt"
600 local safe_completed = math.min(skipped_count + newly_processed, total_poems)
601 local pf = io.open(progress_file, "w")
602 if pf then
603 pf:write(string.format("%d,%d", safe_completed, total_poems))
604 pf:close()
605 end
606 end
607
608 -- store_success: write the canonical success record. Shape is byte-for-byte
609 -- what the per-item path wrote (Issue 8-019 keys) so every downstream reader
610 -- is unaffected by the switch to batching.
611 local function store_success(poem, poem_index, poem_text, embedding)
612 embeddings_data.embeddings[poem_index] = {
613 poem_index = poem_index, -- Unique global identifier (Issue 8-019)
614 id = poem.id, -- Original source file ID (for display)
615 embedding = embedding,
616 content_length = #poem_text,
617 generated_at = os.date("%Y-%m-%d %H:%M:%S"),
618 updated_at = incremental and os.date("%Y-%m-%d %H:%M:%S") or nil
619 }
621 end
622
623 -- Options handed to the batch helper. We pass OUR endpoint and OUR prompt
624 -- formatter so fuzzy-computing's separate inference-server-config instance
625 -- never diverges from this file's server selection. Chunking uses EXACT token
626 -- counts via the server's /tokenize endpoint, and the per-chunk budget is
627 -- computed exactly below (Issue 10-050) — no char estimate anywhere.
628 local COMBINE_STRATEGY = "length_weighted_mean"
629 -- Compute the EXACT per-chunk token budget once (model context - BERT
630 -- specials - the tokenized prefix), via /tokenize. Raises here, before the
631 -- loop starts, if the server is unreachable — no silent fallback. (10-050)
632 local embed_max_tokens = fuzzy.embedding_chunk_budget(endpoint, inference_config.format_embedding_prompt)
633 local embed_opts = {
634 endpoint = endpoint,
635 format_fn = inference_config.format_embedding_prompt,
636 max_tokens = embed_max_tokens,
637 strategy = COMBINE_STRATEGY
638 }
639 -- Record the chunking parameters so a future tuning change is detectable and
640 -- can trigger cache regeneration rather than silently mixing vectors.
641 embeddings_data.metadata.chunking = {
642 tokenizer = "exact (/tokenize)",
643 max_tokens = embed_max_tokens,
644 combine_strategy = COMBINE_STRATEGY,
645 batch_size = window
646 }
647
648 -- handle_deferred: image-only (inherit) and empty (random) poems, handled
649 -- AFTER the window's normal embeddings land so a same-window nearest
650 -- neighbour is already inheritable. Logic preserved verbatim from the old
652 local function handle_deferred(poem, poem_index, poem_text)
653 if poem.is_image_only and poem.nearest_text_poem_index then
654 local nearest_index = poem.nearest_text_poem_index
655 local nearest_embedding = nil
656 if embeddings_data.embeddings[nearest_index] and
657 embeddings_data.embeddings[nearest_index].embedding then
658 nearest_embedding = embeddings_data.embeddings[nearest_index].embedding
659 elseif existing_embeddings[nearest_index] and
660 existing_embeddings[nearest_index].embedding then
661 nearest_embedding = existing_embeddings[nearest_index].embedding
662 end
663
664 if nearest_embedding then
665 local own_embedding = nil
666 if poem_text ~= "" then
667 own_embedding = generate_embedding(poem_text, endpoint, model_name)
668 end
669 local inherited = inherit_embedding(nearest_embedding, own_embedding, model_config.dimensions)
670 utils.log_info("Image-only post " .. poem_index .. " (ID: " .. (poem.id or "unknown") ..
671 ") - inheriting embedding from nearest text poem " .. nearest_index)
672 embeddings_data.embeddings[poem_index] = {
673 poem_index = poem_index,
674 id = poem.id,
675 embedding = inherited,
676 content_length = #poem_text,
677 is_inherited = true,
678 nearest_text_poem_index = nearest_index,
679 generated_at = os.date("%Y-%m-%d %H:%M:%S"),
680 updated_at = os.date("%Y-%m-%d %H:%M:%S")
681 }
682 newly_processed = newly_processed + 1
683 else
684 utils.log_info("Image-only post " .. poem_index .. " - nearest embedding not ready, generating random")
685 local random_embedding = generate_random_embedding(poem.id, model_config.dimensions)
686 embeddings_data.embeddings[poem_index] = {
687 poem_index = poem_index,
688 id = poem.id,
689 embedding = random_embedding,
690 content_length = 0,
691 is_random = true,
692 is_image_only = true,
693 needs_inheritance_update = true,
694 nearest_text_poem_index = nearest_index,
695 generated_at = os.date("%Y-%m-%d %H:%M:%S"),
696 updated_at = os.date("%Y-%m-%d %H:%M:%S")
697 }
698 newly_processed = newly_processed + 1
699 end
700 else
701 -- Empty poem: random embedding to place it semi-randomly.
702 utils.log_info("Empty poem content for ID: " .. (poem.id or "unknown") .. " - generating random embedding")
703 local random_embedding = generate_random_embedding(poem.id, model_config.dimensions)
704 embeddings_data.embeddings[poem_index] = {
705 poem_index = poem_index, -- Issue 8-019
706 id = poem.id,
707 embedding = random_embedding,
708 content_length = 0,
709 is_random = true,
710 generated_at = os.date("%Y-%m-%d %H:%M:%S"),
711 updated_at = os.date("%Y-%m-%d %H:%M:%S")
712 }
714 end
715 end
716 -- }}}
717
718 -- Save the cache roughly every ~100 poems regardless of window size. The old
719 -- `i % 100 == 1` test assumed a step of 10; with a variable window we count
720 -- windows instead so the periodic checkpoint survives a crash mid-run.
721 local windows_since_save = 0
722 local SAVE_EVERY_WINDOWS = math.max(1, math.floor(100 / window))
723
724 for i = 1, #poems_to_process, window do
725 local batch_end = math.min(i + window - 1, #poems_to_process)
726 utils.log_info(string.format("Processing batch %d-%d of %d new/updated poems...", i, batch_end, #poems_to_process))
727
728 -- Partition this window: normal text poems get batched together; image-
729 -- only and empty poems are deferred to after the batch resolves.
730 local normal = {}
731 local deferred = {}
732 for j = i, batch_end do
733 local poem_data = poems_to_process[j]
734 local poem = poem_data.poem
735 local poem_index = poem_data.index
736 -- Issue 6-033: enhanced preprocessing for better embedding quality.
737 local poem_text = poem_extractor.extract_pure_poem_content_for_embedding(poem.content)
738 local entry = { poem = poem, poem_index = poem_index, poem_text = poem_text }
739 if poem.is_image_only and poem.nearest_text_poem_index then
740 table.insert(deferred, entry)
741 elseif poem_text == "" then
742 table.insert(deferred, entry)
743 else
744 table.insert(normal, entry)
745 end
746 end
747
748 -- Embed all normal poems of the window in ONE batched + chunked call.
749 -- A whole-batch transport failure is treated exactly like the old
750 -- per-poem network_error branch: count it, check the thresholds, back
751 -- off, and retry the SAME window.
752 if #normal > 0 then
753 local window_done = false
754 while not window_done do
755 local texts = {}
756 for k = 1, #normal do texts[k] = normal[k].poem_text end
757 utils.log_info(string.format(" Embedding %d text poems (batched, chunked)...", #normal))
758 local vectors, err = fuzzy.embed_texts_with_chunking(texts, model_name, embed_opts)
759
760 if not vectors then
761 consecutive_errors = consecutive_errors + 1
762 total_errors = total_errors + 1
763 utils.log_warn(string.format("Network error %d/%d for batch %d-%d: %s",
764 consecutive_errors, network_error_config.max_consecutive_errors,
765 i, batch_end, tostring(err)))
766
767 if consecutive_errors >= network_error_config.max_consecutive_errors then
768 local safe_completed = math.min(skipped_count + newly_processed, total_poems)
769 utils.log_error("❌ NETWORK ERROR THRESHOLD EXCEEDED")
770 utils.log_error(" • Consecutive errors: " .. consecutive_errors .. "/" .. network_error_config.max_consecutive_errors)
771 utils.log_error(" • Poems processed before termination: " .. safe_completed .. "/" .. total_poems)
772 utils.log_error("The embedding cache has been preserved.")
773 embeddings_data.metadata.completed_embeddings = safe_completed
774 embeddings_data.metadata.completion_rate = safe_completed / total_poems
775 embeddings_data.metadata.processing_mode = "terminated_network_error"
776 embeddings_data.metadata.termination_reason = "consecutive_network_errors"
777 embeddings_data.metadata.last_error_count = consecutive_errors
778 utils.write_json_file(output_file, embeddings_data)
779 return false
780 elseif total_errors >= network_error_config.max_total_errors then
781 utils.log_error("❌ TOTAL ERROR LIMIT EXCEEDED")
782 utils.log_error("Too many network errors in this session: " .. total_errors .. "/" .. network_error_config.max_total_errors)
783 return false
784 else
785 utils.log_info("Retrying in " .. current_delay .. " seconds...")
786 os.execute("sleep " .. current_delay)
787 current_delay = math.min(current_delay * network_error_config.backoff_multiplier,
788 network_error_config.max_retry_delay)
789 -- loop again: retry this whole window
790 end
791 else
792 -- Batch produced results: reset error counters (the server is
793 -- alive) and distribute vectors to each poem.
794 consecutive_errors = 0
795 current_delay = network_error_config.initial_retry_delay
796 for k = 1, #normal do
797 local n = normal[k]
798 local embedding = vectors[k]
799 if embedding and type(embedding) == "table" and #embedding == model_config.dimensions then
800 store_success(n.poem, n.poem_index, n.poem_text, embedding)
801 else
802 -- One poem's vector is missing/wrong-dimension. Single-
803 -- retry it once via the same chunk-aware path; if that
804 -- still fails, record a non-critical error so it is not
805 -- retried forever (matches the old `else` branch).
806 local single = fuzzy.embed_texts_with_chunking({ n.poem_text }, model_name, embed_opts)
807 local sv = single and single[1]
808 if sv and type(sv) == "table" and #sv == model_config.dimensions then
809 store_success(n.poem, n.poem_index, n.poem_text, sv)
810 else
811 embeddings_data.embeddings[n.poem_index] = {
812 poem_index = n.poem_index, -- Issue 8-019
813 id = n.poem.id,
814 embedding = nil,
815 error = "embedding_failed",
816 updated_at = os.date("%Y-%m-%d %H:%M:%S")
817 }
818 utils.log_warn("Non-critical error for poem " .. n.poem_index .. ": embedding_failed")
819 end
820 end
821 end
822 write_progress()
823 window_done = true
824 end
825 end
826 end
827
828 -- Now the deferred poems, with the window's fresh embeddings available.
829 for _, d in ipairs(deferred) do
830 handle_deferred(d.poem, d.poem_index, d.poem_text)
831 end
832 write_progress()
833
834 -- Periodic cache checkpoint (crash safety on long runs).
835 windows_since_save = windows_since_save + 1
836 if windows_since_save >= SAVE_EVERY_WINDOWS or batch_end == #poems_to_process then
837 windows_since_save = 0
838 local safe_completed = math.min(skipped_count + newly_processed, total_poems)
839 utils.log_info("Saving progress... (" .. newly_processed .. " new + " .. skipped_count .. " existing = " .. safe_completed .. " total)")
840 if not utils.write_json_file(output_file, embeddings_data) then
841 utils.log_error("Failed to save embeddings to " .. output_file)
842 return false
843 end
844 end
845 end
846
847 -- Issue 8-021 Fix: Use safe calculation for final metadata
848 local safe_completed = math.min(skipped_count + newly_processed, total_poems)
849 embeddings_data.metadata.completed_embeddings = safe_completed
850 embeddings_data.metadata.completion_rate = safe_completed / total_poems
851 embeddings_data.metadata.new_embeddings = newly_processed
852 embeddings_data.metadata.reused_embeddings = skipped_count
853 embeddings_data.metadata.processing_mode = incremental and "incremental" or "full_regeneration"
854 -- Note: timing_data feature was planned but never implemented.
855 -- Removed reference to undefined timing_data variable (Issue 8-018).
856
857 utils.log_info("Embedding generation complete!")
858 if incremental then
859 utils.log_info("Incremental processing results:")
860 utils.log_info(" New embeddings generated: " .. newly_processed)
861 utils.log_info(" Existing embeddings reused: " .. skipped_count)
862 utils.log_info(" Total embeddings: " .. safe_completed .. " out of " .. total_poems)
863 utils.log_info(" Time savings: " .. string.format("%.1f%%", (skipped_count / total_poems) * 100))
864 else
865 utils.log_info("Full regeneration results:")
866 utils.log_info(" Successfully generated " .. safe_completed .. " out of " .. total_poems .. " embeddings")
867 end
868 utils.log_info("Completion rate: " .. string.format("%.1f%%", (safe_completed / total_poems) * 100))
869
870 return utils.write_json_file(output_file, embeddings_data)
871end
872-- }}}
873
874-- {{{ function validate_similarity_matrix_currency
-- Decide whether a cached similarity matrix still matches the current
-- embeddings file.
--
-- @param similarity_file  path of the cached similarity-matrix JSON
-- @param embeddings_file  path of the embeddings JSON
-- @param poems_file       path of the poems JSON; only used to report how many
--                         embeddings are missing
-- @return table `{valid = true, metadata = ...}` when the matrix can be
--         reused, otherwise `{valid = false, reason = <string>, ...}`. The
--         "embedding_count_mismatch" and "incomplete_dataset" reasons carry
--         extra diagnostic fields.
local function validate_similarity_matrix_currency(similarity_file, embeddings_file, poems_file)
    if not utils.file_exists(similarity_file) then
        return {valid = false, reason = "no_matrix_found"}
    end

    local similarity_data = utils.read_json_file(similarity_file)
    if not similarity_data or not similarity_data.metadata then
        return {valid = false, reason = "no_metadata"}
    end
    local matrix_meta = similarity_data.metadata

    local embeddings_data = utils.read_json_file(embeddings_file)
    local poems_data = utils.read_json_file(poems_file)

    -- Count current valid embeddings (and all entries, valid or not, as a
    -- fallback poem total when the poems file cannot be used).
    local current_embeddings = 0
    local entry_count = 0
    if embeddings_data and embeddings_data.embeddings then
        for _, emb in pairs(embeddings_data.embeddings) do
            entry_count = entry_count + 1
            if type(emb) == "table" and emb.embedding and #emb.embedding > 0 then
                current_embeddings = current_embeddings + 1
            end
        end
    end

    -- The poems file may be missing or malformed; that must not crash the
    -- validation, it only degrades the "missing embeddings" diagnostic.
    local total_poems = entry_count
    if poems_data and type(poems_data.poems) == "table" then
        total_poems = #poems_data.poems
    end

    local matrix_embeddings = matrix_meta.embedding_count or 0

    if current_embeddings ~= matrix_embeddings then
        return {
            valid = false,
            reason = "embedding_count_mismatch",
            current_count = current_embeddings,
            matrix_count = matrix_embeddings,
            difference = current_embeddings - matrix_embeddings
        }
    end

    if not matrix_meta.is_complete then
        return {
            valid = false,
            reason = "incomplete_dataset",
            completeness = matrix_meta.matrix_completeness or 0,
            missing_embeddings = total_poems - current_embeddings
        }
    end

    return {valid = true, metadata = matrix_meta}
end
923-- }}}
924
925-- {{{ function M.calculate_similarity_matrix
926-- DEPRECATED (Issue 8-029): This function generates a top-N array format that is incompatible
927-- with the HTML generator. Use calculate_full_similarity_matrix() instead, which generates
928-- the full pairwise format required by flat-html-generator.lua and other consumers.
929-- Kept for reference and potential future use cases where top-N is sufficient.
-- Build the legacy top-N similarity file: for every poem with a usable
-- embedding, keep the `top_n` most similar other poems (cosine similarity).
--
-- @param embeddings_file   path of the embeddings JSON to read
-- @param output_file       path of the similarity JSON to write
-- @param top_n             neighbours kept per poem (default 10)
-- @param force_regenerate  when truthy, skip the cache validation and rebuild
-- @return true when an up-to-date matrix exists or was written, false when the
--         embeddings cannot be loaded, contain no usable vectors, or a save
--         fails
function M.calculate_similarity_matrix(embeddings_file, output_file, top_n, force_regenerate)
    top_n = top_n or 10
    force_regenerate = force_regenerate or false

    -- Need poems file for validation (use configured assets path)
    local poems_file = utils.asset_path("poems.json")

    -- Validate existing matrix unless forced to regenerate
    if not force_regenerate then
        local validation = validate_similarity_matrix_currency(output_file, embeddings_file, poems_file)
        if validation.valid then
            utils.log_info("✅ Existing similarity matrix is current and complete")
            return true
        end

        utils.log_warn("⚠️ Similarity matrix validation failed: " .. validation.reason)
        if validation.reason == "embedding_count_mismatch" then
            utils.log_info(" Current embeddings: " .. validation.current_count)
            utils.log_info(" Matrix embeddings: " .. validation.matrix_count)
            utils.log_info(" Difference: " .. validation.difference)
        elseif validation.reason == "incomplete_dataset" then
            utils.log_info(" Completeness: " .. string.format("%.1f%%", validation.completeness * 100))
            utils.log_info(" Missing embeddings: " .. validation.missing_embeddings)
        end
        utils.log_info("🗑️ Removing stale similarity matrix...")
        os.remove(output_file)
    end

    utils.log_info("Loading embeddings from: " .. embeddings_file)
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        utils.log_error("Failed to load embeddings from " .. embeddings_file)
        return false
    end

    local embeddings = embeddings_data.embeddings
    local valid_embeddings = {}

    -- Filter out invalid embeddings
    for i, item in ipairs(embeddings) do
        if type(item) == "table" and item.embedding and #item.embedding > 0 then
            valid_embeddings[#valid_embeddings + 1] = {
                index = i,
                id = item.id,
                embedding = item.embedding
            }
        end
    end

    -- Without any vector the loop below would never run and no file would be
    -- written; report that as a failure like the other matrix generators do.
    if #valid_embeddings == 0 then
        utils.log_error("No valid embeddings found")
        return false
    end

    -- Load poems data to get actual total count; fall back to the number of
    -- embedding entries when the poems file is missing or has no `poems` list.
    local poems_data = utils.read_json_file(poems_file)
    local total_poems = #embeddings
    if poems_data and type(poems_data.poems) == "table" then
        total_poems = #poems_data.poems
    end

    -- Calculate completeness metrics (guard the division for an empty list)
    local embedding_count = #valid_embeddings
    local matrix_completeness = 0
    if total_poems > 0 then
        matrix_completeness = embedding_count / total_poems
    end
    local is_complete = embedding_count == total_poems

    -- Warn about incomplete datasets
    if not is_complete then
        utils.log_warn("⚠️ WARNING: Incomplete dataset detected")
        utils.log_info(" Embeddings: " .. embedding_count .. " / " .. total_poems .. " poems (" .. string.format("%.1f%%", matrix_completeness * 100) .. " complete)")
        utils.log_info(" Missing: " .. (total_poems - embedding_count) .. " poems will not appear in recommendations")
        utils.log_info("")
        utils.log_info(" For complete recommendations, generate embeddings for all poems first")
    end

    utils.log_info("Calculating similarity matrix for " .. #valid_embeddings .. " valid embeddings...")

    local similarity_data = {
        metadata = {
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            model_name = embeddings_data.metadata and embeddings_data.metadata.embedding_model or "unknown",
            total_poems = total_poems,
            embedding_count = embedding_count,
            matrix_completeness = matrix_completeness,
            is_complete = is_complete,
            top_n = top_n,
            algorithm = "cosine_similarity"
        },
        similarities = {}
    }

    -- Only unique pairs (j > i) are counted for progress reporting, although
    -- every ordered pair is evaluated below.
    local total_comparisons = #valid_embeddings * (#valid_embeddings - 1) / 2
    local completed_comparisons = 0

    for i = 1, #valid_embeddings do
        local poem_a = valid_embeddings[i]
        local similarities_for_poem = {}

        -- Issue 8-024: Use carriage return to overwrite line in-place
        io.write(string.format("\r[INFO] Processing poem %d/%d (ID: %s) ", i, #valid_embeddings, poem_a.id or "unknown"))
        io.flush()

        for j = 1, #valid_embeddings do
            if i ~= j then
                local poem_b = valid_embeddings[j]
                local similarity = cosine_similarity(poem_a.embedding, poem_b.embedding)

                similarities_for_poem[#similarities_for_poem + 1] = {
                    id = poem_b.id,
                    index = poem_b.index,
                    similarity = similarity
                }

                if j > i then
                    completed_comparisons = completed_comparisons + 1
                end
            end
        end

        -- Sort by similarity (highest first) and keep only top N
        table.sort(similarities_for_poem, function(a, b) return a.similarity > b.similarity end)

        local top_similarities = {}
        for k = 1, math.min(top_n, #similarities_for_poem) do
            top_similarities[k] = similarities_for_poem[k]
        end

        local poem_key = poem_a.id or ("poem_" .. poem_a.index)
        similarity_data.similarities[poem_key] = {
            poem_index = poem_a.index,
            top_similar = top_similarities,
            calculated_at = os.date("%Y-%m-%d %H:%M:%S")
        }

        -- Save progress periodically
        if i % 50 == 0 or i == #valid_embeddings then
            -- A single poem has no pairs to compare; treat that as 100% done
            -- instead of formatting 0/0.
            local progress = 100
            if total_comparisons > 0 then
                progress = (completed_comparisons / total_comparisons) * 100
            end
            -- Issue 8-024: Newline before progress to preserve it (processing line uses \r)
            io.write("\n")
            utils.log_info(string.format("Progress: %.1f%% (%d/%d comparisons)", progress, completed_comparisons, total_comparisons))

            if not utils.write_json_file(output_file, similarity_data) then
                utils.log_error("Failed to save similarity matrix to " .. output_file)
                return false
            end
        end
    end

    utils.log_info("Similarity matrix calculation complete!")
    utils.log_info("Calculated similarities for " .. #valid_embeddings .. " poems")
    utils.log_info("Total comparisons: " .. total_comparisons)

    return true
end
1076-- }}}
1077
1078-- {{{ function M.calculate_full_similarity_matrix
-- Build the full pairwise similarity matrix: `similarities[id_a][id_b]` holds
-- the cosine similarity of every ordered pair of poems that have a usable
-- embedding and an id (the diagonal is stored as 1.0).
--
-- @param embeddings_file   path of the embeddings JSON to read
-- @param output_file       path of the matrix JSON to write
-- @param force_regenerate  when truthy, rebuild even if `output_file` already
--                          holds a matrix flagged as complete
-- @return true when a complete matrix exists or was written, false when the
--         embeddings cannot be loaded, contain no usable vectors, or a save
--         fails
function M.calculate_full_similarity_matrix(embeddings_file, output_file, force_regenerate)
    force_regenerate = force_regenerate or false

    -- Check if full matrix already exists and is current
    if not force_regenerate and utils.file_exists(output_file) then
        local existing_data = utils.read_json_file(output_file)
        if existing_data and existing_data.metadata and existing_data.metadata.is_complete then
            utils.log_info("✅ Full similarity matrix already exists and is complete")
            return true
        end
    end

    utils.log_info("🔍 Generating FULL similarity matrix (all poem pairs)...")
    utils.log_info("⚠️ This will generate ALL 47.1M comparisons (no symmetry optimization) and may take 4-8 hours")

    -- Load embeddings
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        utils.log_error("Failed to load embeddings from " .. embeddings_file)
        return false
    end

    local embeddings = embeddings_data.embeddings
    local valid_embeddings = {}

    -- Filter out invalid embeddings
    for _, embedding in ipairs(embeddings) do
        if type(embedding) == "table" and embedding.embedding
            and #embedding.embedding > 0 and embedding.id then
            valid_embeddings[#valid_embeddings + 1] = embedding
        end
    end

    local poem_count = #valid_embeddings
    if poem_count == 0 then
        utils.log_error("No valid embeddings found")
        return false
    end

    utils.log_info(string.format("Processing %d poems for full similarity matrix", poem_count))

    local total_comparisons = poem_count * poem_count
    local completed_comparisons = 0
    local start_time = os.time()

    -- Initialize full similarity matrix. `is_complete` stays false while the
    -- checkpoints below are written, so a file left behind by an interrupted
    -- run is not mistaken for a finished matrix by the check at the top.
    local similarity_data = {
        metadata = {
            is_complete = false,
            total_poems = poem_count,
            matrix_size = total_comparisons,
            algorithm = "cosine_similarity",
            -- The embeddings file may lack a metadata block.
            model_name = embeddings_data.metadata and embeddings_data.metadata.embedding_model or "unknown",
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            embedding_count = poem_count
        },
        similarities = {}
    }

    -- Generate COMPLETE similarity matrix (calculate ALL comparisons)
    for i = 1, poem_count do
        local poem_a = valid_embeddings[i]
        local poem_a_id = tostring(poem_a.id)
        local row = {}
        similarity_data.similarities[poem_a_id] = row

        -- Issue 8-024: Use carriage return to overwrite line in-place
        io.write(string.format("\r[INFO] Processing poem %d/%d (ID: %s) ", i, poem_count, poem_a_id))
        io.flush()

        for j = 1, poem_count do
            local poem_b = valid_embeddings[j]
            local poem_b_id = tostring(poem_b.id)

            if i == j then
                -- Self-similarity is always 1.0
                row[poem_b_id] = 1.0
            else
                -- Every ordered pair is computed independently (no symmetry
                -- optimization).
                local similarity = cosine_similarity(poem_a.embedding, poem_b.embedding)
                -- Keep 4 decimal places for storage efficiency (math.floor
                -- truncates toward negative infinity rather than rounding).
                row[poem_b_id] = math.floor(similarity * 10000) / 10000
            end

            completed_comparisons = completed_comparisons + 1
        end

        -- Progressive saving every 100 poems to prevent data loss
        if i % 100 == 0 or i == poem_count then
            local progress = (completed_comparisons / total_comparisons) * 100
            -- os.time() has one-second resolution; clamp so the rate is never
            -- computed with a zero divisor.
            local elapsed_time = math.max(os.time() - start_time, 1)
            local rate = completed_comparisons / elapsed_time
            local estimated_remaining = (total_comparisons - completed_comparisons) / rate

            -- Issue 8-024: Newline before progress to preserve it (processing line uses \r)
            io.write("\n")
            utils.log_info(string.format("Progress: %.2f%% (%d/%d comparisons)",
                progress, completed_comparisons, total_comparisons))
            utils.log_info(string.format("Rate: %.0f comparisons/sec, Est. remaining: %.0f minutes",
                rate, estimated_remaining / 60))

            if not utils.write_json_file(output_file, similarity_data) then
                utils.log_error("Failed to save similarity matrix to " .. output_file)
                return false
            end
            utils.log_info("✅ Progress saved to disk")
        end

        -- Memory management: force garbage collection periodically
        if i % 500 == 0 then
            collectgarbage("collect")
        end
    end

    -- Final save: only now is the matrix flagged complete.
    similarity_data.metadata.is_complete = true
    similarity_data.metadata.completed_at = os.date("%Y-%m-%d %H:%M:%S")
    similarity_data.metadata.generation_time_seconds = os.time() - start_time

    if not utils.write_json_file(output_file, similarity_data) then
        utils.log_error("Failed to save final similarity matrix")
        return false
    end

    utils.log_info("🎉 Full similarity matrix generation complete!")
    utils.log_info(string.format("Total comparisons: %d", total_comparisons))
    utils.log_info(string.format("Generation time: %.1f minutes", (os.time() - start_time) / 60))
    utils.log_info(string.format("Matrix saved to: %s", output_file))

    return true
end
1212-- }}}
1213
1214-- {{{ function M.calculate_triangular_similarity_matrix
-- Build an upper-triangular similarity matrix: for poems at positions i < j in
-- the filtered embeddings list, `similarities[id_i][id_j]` holds their cosine
-- similarity. Rows follow list order, not numeric id order.
--
-- @param embeddings_file   path of the embeddings JSON to read
-- @param output_file       path of the matrix JSON to write
-- @param force_regenerate  when falsy, an existing `output_file` is kept as is
-- @return true when the file already existed or was written, false when the
--         embeddings cannot be loaded, contain no usable vectors, or a save
--         fails
function M.calculate_triangular_similarity_matrix(embeddings_file, output_file, force_regenerate)
    utils.log_info("🔍 Generating TRIANGULAR similarity matrix (optimized storage)...")

    -- Check if output already exists and not forcing regeneration
    if not force_regenerate and utils.file_exists(output_file) then
        utils.log_info("Triangular similarity matrix already exists. Use force_regenerate=true to recreate.")
        return true
    end

    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        utils.log_error("Failed to load embeddings file: " .. embeddings_file)
        return false
    end

    local embeddings = embeddings_data.embeddings
    local poems = {}

    -- Filter out invalid embeddings (same as full matrix function)
    for _, embedding in ipairs(embeddings) do
        if type(embedding) == "table" and embedding.embedding
            and #embedding.embedding > 0 and embedding.id then
            poems[#poems + 1] = embedding
        end
    end

    local poem_count = #poems
    if poem_count == 0 then
        utils.log_error("No valid embeddings found")
        return false
    end

    utils.log_info("Processing " .. poem_count .. " poems for triangular similarity matrix")

    -- Calculate storage requirements
    local total_unique_pairs = (poem_count * (poem_count - 1)) / 2
    utils.log_info(string.format("⚠️ This will generate %d unique comparisons (50%% reduction from full matrix)", total_unique_pairs))
    utils.log_info("⚠️ Expected storage: ~50% reduction from full matrix size")

    -- The other matrix generators read the model name from
    -- metadata.embedding_model; prefer that and keep the top-level field as a
    -- fallback.
    local model_name = embeddings_data.metadata and embeddings_data.metadata.embedding_model
    if model_name == nil then
        model_name = embeddings_data.model_name
    end

    local similarity_data = {
        metadata = {
            matrix_size = total_unique_pairs,
            total_poems = poem_count,
            model_name = model_name,
            algorithm = "cosine_similarity",
            embedding_count = poem_count,
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            -- Flipped to true only before the final save, so checkpoint files
            -- are not labelled as finished.
            is_complete = false,
            storage_format = "triangular_upper"
        },
        similarities = {}
    }

    local start_time = os.time()
    local completed = 0

    -- Generate upper triangular matrix only (i < j)
    for i = 1, poem_count do
        local poem_a = poems[i]
        local row = {}
        similarity_data.similarities[tostring(poem_a.id)] = row

        -- Only calculate similarities for j > i (upper triangle)
        for j = i + 1, poem_count do
            local poem_b = poems[j]

            local similarity = cosine_similarity(poem_a.embedding, poem_b.embedding)
            row[tostring(poem_b.id)] = math.floor(similarity * 10000) / 10000 -- 4 decimal precision

            completed = completed + 1

            -- Progress reporting every 10000 comparisons
            if completed % 10000 == 0 then
                local progress_percent = (completed / total_unique_pairs) * 100
                -- os.time() has one-second resolution; avoid a zero divisor.
                local elapsed = math.max(os.time() - start_time, 1)
                local rate = completed / elapsed
                local remaining_time = (total_unique_pairs - completed) / rate / 60

                utils.log_info(string.format("Progress: %.2f%% (%d/%d comparisons)",
                    progress_percent, completed, total_unique_pairs))
                utils.log_info(string.format("Rate: %.0f comparisons/sec, Est. remaining: %.1f minutes",
                    rate, remaining_time))
            end
        end

        -- Progressive saving every 100 poems; a failed checkpoint aborts the
        -- run instead of being reported as saved.
        if i % 100 == 0 then
            if not utils.write_json_file(output_file, similarity_data) then
                utils.log_error("Failed to save triangular similarity matrix")
                return false
            end
            utils.log_info(string.format("✅ Progress saved to disk (poem %d/%d)", i, poem_count))
        end

        -- Garbage collection every 500 poems
        if i % 500 == 0 then
            collectgarbage()
            utils.log_info(string.format("🗑️ Memory cleanup completed (poem %d/%d)", i, poem_count))
        end
    end

    -- Final save
    similarity_data.metadata.is_complete = true
    if not utils.write_json_file(output_file, similarity_data) then
        utils.log_error("Failed to save triangular similarity matrix")
        return false
    end

    utils.log_info("✅ TRIANGULAR similarity matrix generation completed!")
    utils.log_info(string.format("Total unique comparisons: %d", total_unique_pairs))
    utils.log_info(string.format("Generation time: %.1f minutes", (os.time() - start_time) / 60))
    utils.log_info(string.format("Matrix saved to: %s", output_file))
    utils.log_info("📊 Storage optimized: ~50% reduction from full matrix")

    return true
end
1325-- }}}
1326
1327-- {{{ function M.get_similarity_triangular
-- Look up the similarity of two poems in a triangular matrix.
--
-- Each pair is stored once, under whichever poem came first when the matrix
-- was generated, so both orientations are tried. Ids that parse as numbers
-- are normalised through tonumber so that 5 and "5" address the same row.
--
-- @param matrix  table with a `similarities[id_a][id_b]` mapping
-- @param id1     first poem id (number or string)
-- @param id2     second poem id (number or string)
-- @return the stored similarity, 1.0 when both ids denote the same poem, or
--         0.0 (after logging a warning) when the pair is not in the matrix
function M.get_similarity_triangular(matrix, id1, id2)
    local key1 = tostring(tonumber(id1) or id1)
    local key2 = tostring(tonumber(id2) or id2)

    -- Handle diagonal (self-similarity)
    if key1 == key2 then return 1.0 end

    local rows = matrix.similarities

    -- Try the pair as given, then mirrored.
    local row = rows[key1]
    local score = row and row[key2]
    if score == nil then
        row = rows[key2]
        score = row and row[key1]
    end
    if score ~= nil then
        return score
    end

    -- Fallback (should not happen with complete matrix)
    utils.log_warn(string.format("Similarity not found for poems %s and %s",
        tostring(id1), tostring(id2)))
    return 0.0
end
1346-- }}}
1347
1348-- {{{ function M.get_all_similarities_for_poem_triangular
-- Rank every other poem against `poem_id` using a triangular matrix.
--
-- @param matrix    triangular similarity matrix handed to
--                  M.get_similarity_triangular
-- @param poem_id   id of the reference poem
-- @param poem_ids  sequence of candidate ids; entries equal to `poem_id` are
--                  skipped
-- @return sequence of `{target_id = ..., score = ...}` tables ordered by
--         descending score
function M.get_all_similarities_for_poem_triangular(matrix, poem_id, poem_ids)
    local ranked = {}

    for _, candidate_id in ipairs(poem_ids) do
        local is_self = (candidate_id == poem_id)
        if not is_self then
            ranked[#ranked + 1] = {
                target_id = candidate_id,
                score = M.get_similarity_triangular(matrix, poem_id, candidate_id)
            }
        end
    end

    -- Highest score first
    local function by_score_desc(left, right)
        return left.score > right.score
    end
    table.sort(ranked, by_score_desc)

    return ranked
end
1369-- }}}
1370
1371-- {{{ function M.generate_similarity_report
-- Summarise a top-N similarity file into a small JSON report (counts,
-- average/min/max similarity, and the best match of every poem whose top
-- similarity exceeds 0.8).
--
-- Only entries that carry a `top_similar` list contribute to the statistics.
--
-- @param similarity_file  path of the similarity JSON to analyse
-- @param poems_file       path of the poems JSON (used for the poem total)
-- @param output_file      path of the report JSON to write
-- @return false when an input file cannot be loaded, otherwise the result of
--         utils.write_json_file for the report
function M.generate_similarity_report(similarity_file, poems_file, output_file)
    utils.log_info("Generating similarity analysis report...")

    local similarity_data = utils.read_json_file(similarity_file)
    local poems_data = utils.read_json_file(poems_file)

    if not similarity_data or not poems_data then
        utils.log_error("Failed to load required data files")
        return false
    end

    -- Elsewhere in this file the poems file is read as an object with a
    -- `poems` list; measure that list, falling back to the top-level value
    -- when the file is a bare array.
    local poem_list = poems_data
    if type(poems_data.poems) == "table" then
        poem_list = poems_data.poems
    end

    local report = {
        metadata = {
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            total_poems = #poem_list,
            poems_with_similarities = 0,
            average_similarity = 0,
            max_similarity = 0,
            min_similarity = 1
        },
        statistics = {},
        sample_similarities = {}
    }
    local meta = report.metadata

    local total_similarity = 0
    local similarity_count = 0

    -- A file without a `similarities` table yields an empty report instead of
    -- a crash in pairs().
    for poem_id, data in pairs(similarity_data.similarities or {}) do
        meta.poems_with_similarities = meta.poems_with_similarities + 1

        local top = type(data) == "table" and data.top_similar
        if top and #top > 0 then
            -- The list is expected to be sorted best-first, so its ends give
            -- the per-poem extremes.
            local max_sim = top[1].similarity
            local min_sim = top[#top].similarity

            meta.max_similarity = math.max(meta.max_similarity, max_sim)
            meta.min_similarity = math.min(meta.min_similarity, min_sim)

            for _, sim in ipairs(top) do
                total_similarity = total_similarity + sim.similarity
                similarity_count = similarity_count + 1
            end

            -- Add sample for high-similarity pairs
            if max_sim > 0.8 then
                report.sample_similarities[#report.sample_similarities + 1] = {
                    poem_a_id = poem_id,
                    poem_b_id = top[1].id,
                    similarity = max_sim
                }
            end
        end
    end

    if similarity_count > 0 then
        meta.average_similarity = total_similarity / similarity_count
    else
        -- No data: do not leak the initial sentinel (1) as a "minimum" that is
        -- larger than the maximum.
        meta.min_similarity = 0
    end

    utils.log_info("Similarity analysis complete!")
    utils.log_info("Poems with similarities: " .. meta.poems_with_similarities)
    utils.log_info("Average similarity: " .. string.format("%.3f", meta.average_similarity))
    utils.log_info("Similarity range: " .. string.format("%.3f - %.3f", meta.min_similarity, meta.max_similarity))

    return utils.write_json_file(output_file, report)
end
1436-- }}}
1437
1438-- {{{ function M.generate_all_model_similarity_matrices
-- Generate a similarity matrix for every known model whose embeddings are
-- complete enough.
--
-- @param base_output_dir   directory passed to M.get_model_status and
--                          get_model_storage_path
-- @param min_completeness  minimum embeddings/poems ratio required
--                          (default 0.8)
-- @param use_full_matrix   truthy: full pairwise matrix written to a
--                          "_full.json" sibling; falsy: sparse top-N matrix
-- @param total_poems       optional poem total used for the completeness
--                          ratio (default 6860)
-- @return table keyed by model name describing the outcome per model, or an
--         empty table when no model is eligible
function M.generate_all_model_similarity_matrices(base_output_dir, min_completeness, use_full_matrix, total_poems)
    min_completeness = min_completeness or 0.8 -- 80% minimum completeness
    use_full_matrix = use_full_matrix or false -- Default to sparse matrices
    if type(total_poems) ~= "number" or total_poems <= 0 then
        total_poems = 6860 -- Known total poem count
    end

    utils.log_info("🔄 Generating similarity matrices for all eligible models...")
    utils.log_info("⚙️ Minimum completeness required: " .. (min_completeness * 100) .. "%")
    utils.log_info("📊 Matrix type: " .. (use_full_matrix and "FULL (all comparisons)" or "SPARSE (top-N)"))

    local models = M.list_available_models()
    local results = {}
    local eligible = {} -- models to process, in discovery order

    -- First pass: check eligibility (each model's status is queried once and
    -- reused by the second pass)
    for model_name in pairs(models) do
        local status = M.get_model_status(base_output_dir, model_name)

        if status.exists then
            local completeness = status.count / total_poems

            if completeness >= min_completeness then
                eligible[#eligible + 1] = {
                    name = model_name,
                    status = status,
                    completeness = completeness
                }
                utils.log_info(string.format("✅ %s (%.1f%% complete, %d poems)",
                    model_name, completeness * 100, status.count))
            else
                utils.log_warn(string.format("⚠️ Skipping %s (only %.1f%% complete, %d poems)",
                    model_name, completeness * 100, status.count))
                results[model_name] = {
                    success = false,
                    reason = "insufficient_completeness",
                    completeness = completeness,
                    embedding_count = status.count,
                    required_completeness = min_completeness
                }
            end
        else
            utils.log_info("❌ No embeddings found for " .. model_name)
            results[model_name] = {
                success = false,
                reason = "no_embeddings",
                completeness = 0,
                embedding_count = 0
            }
        end
    end

    local eligible_count = #eligible
    if eligible_count == 0 then
        utils.log_warn("No models meet the minimum completeness requirement")
        return {}
    end

    utils.log_info("📈 Processing " .. eligible_count .. " eligible models")

    -- Second pass: generate matrices
    for current_model, entry in ipairs(eligible) do
        local model_name = entry.name

        utils.log_info(string.format("🔄 [%d/%d] Processing %s", current_model, eligible_count, model_name))

        local storage_paths = get_model_storage_path(base_output_dir, model_name)
        local matrix_file = storage_paths.similarity_matrix
        if use_full_matrix then
            -- Parentheses drop gsub's second return value (the match count).
            matrix_file = (matrix_file:gsub("%.json$", "_full.json"))
        end

        local start_time = os.time()
        local success

        if use_full_matrix then
            success = M.calculate_full_similarity_matrix(
                storage_paths.embeddings,
                matrix_file,
                false -- Don't force regenerate unless needed
            )
        else
            success = M.calculate_similarity_matrix(
                storage_paths.embeddings,
                matrix_file
            )
        end

        local generation_time = os.time() - start_time

        results[model_name] = {
            success = success,
            completeness = entry.completeness,
            embedding_count = entry.status.count,
            matrix_file = matrix_file,
            generation_time = generation_time,
            matrix_type = use_full_matrix and "full" or "sparse"
        }

        if success then
            utils.log_info(string.format("✅ Matrix generation complete for %s (took %d seconds)", model_name, generation_time))
        else
            utils.log_error("❌ Matrix generation failed for " .. model_name)
        end
    end

    -- Summary report: an entry with a `reason` was skipped, an unsuccessful
    -- entry without one failed during generation.
    local successful_models = 0
    local skipped_models = 0
    local failed_models = 0

    for _, result in pairs(results) do
        if result.success then
            successful_models = successful_models + 1
        elseif result.reason then
            skipped_models = skipped_models + 1
        else
            failed_models = failed_models + 1
        end
    end

    utils.log_info("📊 Generation Summary:")
    utils.log_info(" ✅ Successful: " .. successful_models .. " models")
    utils.log_info(" ⚠️ Skipped: " .. skipped_models .. " models")
    utils.log_info(" ❌ Failed: " .. failed_models .. " models")

    return results
end
1570-- }}}
1571
1572-- {{{ function M.compare_model_similarities
-- Report, per model, whether a similarity matrix file is available.
--
-- @param poem_id          poem the comparison is about; only used in the log
--                         line
-- @param base_output_dir  directory passed to get_model_storage_path
-- @param models           optional sequence of model names; when nil or empty
--                         every model from M.list_available_models is used.
--                         The caller's table is not modified.
-- @param use_full_matrix  truthy: look for the "_full.json" matrix; falsy:
--                         look for the sparse matrix
-- @return table keyed by model name with `matrix_available` plus either
--         `matrix_type`/`matrix_file` or `reason = "matrix_not_found"`
function M.compare_model_similarities(poem_id, base_output_dir, models, use_full_matrix)
    use_full_matrix = use_full_matrix or false

    -- Copy into a private list so the default fill below never writes into a
    -- table owned by the caller.
    local model_names = {}
    for _, name in ipairs(models or {}) do
        model_names[#model_names + 1] = name
    end

    -- If no models specified, use all available models
    if #model_names == 0 then
        for model_name in pairs(M.list_available_models()) do
            model_names[#model_names + 1] = model_name
        end
    end

    -- tostring keeps the log line from raising when poem_id is nil.
    utils.log_info("🔍 Comparing similarities for poem " .. tostring(poem_id) .. " across models")

    local comparisons = {}

    for _, model_name in ipairs(model_names) do
        local storage_paths = get_model_storage_path(base_output_dir, model_name)
        local matrix_file = storage_paths.similarity_matrix
        if use_full_matrix then
            -- Parentheses drop gsub's second return value (the match count).
            matrix_file = (matrix_file:gsub("%.json$", "_full.json"))
        end

        if utils.file_exists(matrix_file) then
            -- For now, generate basic similarity data - this would integrate with recommendation system
            comparisons[model_name] = {
                matrix_available = true,
                matrix_type = use_full_matrix and "full" or "sparse",
                matrix_file = matrix_file
            }
            utils.log_info("✅ " .. model_name .. " - Matrix available")
        else
            comparisons[model_name] = {
                matrix_available = false,
                reason = "matrix_not_found"
            }
            utils.log_info("❌ " .. model_name .. " - Matrix not found")
        end
    end

    return comparisons
end
1614-- }}}
1615
1616-- {{{ function M.get_multi_model_status
-- Log and return, for every known model, how many embeddings exist and which
-- similarity-matrix files are present.
--
-- @param base_output_dir  directory passed to M.get_model_status and
--                         get_model_storage_path
-- @param total_poems      optional poem total used for the completeness ratio
--                         (default 6860)
-- @return table keyed by model name with dimensions, embedding_count,
--         completeness, sparse_matrix_exists, full_matrix_exists and
--         eligible_for_generation (completeness >= 0.8)
function M.get_multi_model_status(base_output_dir, total_poems)
    utils.log_info("📊 Per-Model Similarity Matrix Status:")

    if type(total_poems) ~= "number" or total_poems <= 0 then
        total_poems = 6860
    end

    local REQUIRED_COMPLETENESS = 0.8
    local models = M.list_available_models()
    local status_summary = {}

    for model_name, config in pairs(models) do
        local status = M.get_model_status(base_output_dir, model_name)
        local storage_paths = get_model_storage_path(base_output_dir, model_name)

        local sparse_matrix_exists = utils.file_exists(storage_paths.similarity_matrix)
        local full_matrix_file = storage_paths.similarity_matrix:gsub("%.json$", "_full.json")
        local full_matrix_exists = utils.file_exists(full_matrix_file)

        -- M.get_model_status is not visible from here; treat the count as 0
        -- when no embeddings exist so the arithmetic below cannot hit nil.
        local embedding_count = (status.exists and status.count) or 0
        local completeness = embedding_count / total_poems

        utils.log_info(" " .. model_name .. " (" .. config.dimensions .. " dims)")

        if status.exists then
            utils.log_info(string.format(" ✅ Embeddings: %d/%d (%.1f%%)",
                embedding_count, total_poems, completeness * 100))
        else
            utils.log_info(" ❌ Embeddings: 0/" .. total_poems .. " (0%)")
        end

        if sparse_matrix_exists then
            utils.log_info(" ✅ Sparse Matrix: Generated")
        else
            utils.log_info(" ❌ Sparse Matrix: Not generated")
        end

        if full_matrix_exists then
            utils.log_info(" ✅ Full Matrix: Generated")
        else
            utils.log_info(" ❌ Full Matrix: Not generated")
        end

        if completeness < REQUIRED_COMPLETENESS then
            local needed = math.ceil((REQUIRED_COMPLETENESS * total_poems) - embedding_count)
            utils.log_info(" 🔄 Recommendation: Complete " .. needed .. " more embeddings")
        end

        status_summary[model_name] = {
            dimensions = config.dimensions,
            embedding_count = embedding_count,
            completeness = completeness,
            sparse_matrix_exists = sparse_matrix_exists,
            full_matrix_exists = full_matrix_exists,
            eligible_for_generation = completeness >= REQUIRED_COMPLETENESS
        }
    end

    return status_summary
end
1672-- }}}
1673
1674-- {{{ function M.main
-- Entry point for command-line use.
--
-- When interactive_mode is truthy, prints a numbered menu, reads one choice
-- from stdin and runs the matching operation, prompting for any further
-- details. Otherwise it regenerates the similarity report from an existing
-- similarity matrix file, if one is present.
--
-- Every prompt treats end-of-input (io.read() returning nil) as an empty
-- answer, so a closed or piped stdin selects the defaults instead of raising
-- "attempt to index a nil value".
--
-- Parameters:
--   interactive_mode  truthy to show the menu; falsy for the default report run
function M.main(interactive_mode)
    local DEFAULT_MODEL = "embeddinggemma:latest"
    -- Issue 8-032: the matrix filename uses an underscore. The interactive
    -- report/pipeline options previously still used the hyphenated
    -- "similarity-matrix.json", so they disagreed with the default run below.
    local SIMILARITY_MATRIX_FILE = "similarity_matrix.json"

    -- Write a prompt and return the user's answer ("" on end-of-input).
    local function ask(question)
        io.write(question)
        return io.read() or ""
    end

    -- Ask for an embedding model name, falling back to the default on Enter.
    local function ask_model()
        local answer = ask("Embedding model (default: EmbeddingGemma:latest): ")
        if answer == "" then
            return DEFAULT_MODEL
        end
        return answer
    end

    -- True when the answer starts with "f"/"F" (full matrix); sparse otherwise.
    local function ask_full_matrix(question)
        return ask(question):lower():sub(1, 1) == "f"
    end

    if not interactive_mode then
        -- Default: run similarity analysis on existing data
        utils.log_info("Running similarity engine analysis...")
        local similarity_file = utils.asset_path(SIMILARITY_MATRIX_FILE)
        local poems_file = utils.asset_path("poems.json")
        local report_file = utils.asset_path("similarity-report.json")

        if utils.file_exists(similarity_file) then
            M.generate_similarity_report(similarity_file, poems_file, report_file)
        else
            utils.log_info("No similarity matrix found. Use interactive mode (-I) to generate embeddings and similarities.")
        end
        return
    end

    utils.log_info("=== Similarity Engine Interactive Mode ===")
    print("1. Generate embeddings for all poems")
    print("2. Calculate similarity matrix (sparse, top-N)")
    print("3. Calculate FULL similarity matrix (all pairs)")
    print("4. Generate similarity analysis report")
    print("5. Run complete pipeline")
    print("6. Generate matrices for ALL eligible models")
    print("7. Show multi-model status")
    print("8. Compare model similarities")
    local choice = ask("Select option (1-8): ")

    if choice == "1" then
        local poems_file = utils.asset_path("poems.json")
        local base_output_dir = utils.get_assets_root()
        local incremental_choice = ask("Use incremental processing? (Y/n): "):lower()
        -- Anything other than an explicit "n"/"no" keeps incremental mode on.
        local incremental = not (incremental_choice == "n" or incremental_choice == "no")
        local model_name = ask_model()
        M.generate_all_embeddings(poems_file, base_output_dir, nil, incremental, model_name)
    elseif choice == "2" then
        local model_name = ask_model()
        local base_output_dir = utils.get_assets_root()
        local storage_paths = get_model_storage_path(base_output_dir, model_name)
        M.calculate_similarity_matrix(storage_paths.embeddings, storage_paths.similarity_matrix)
    elseif choice == "3" then
        local model_name = ask_model()
        local base_output_dir = utils.get_assets_root()
        local storage_paths = get_model_storage_path(base_output_dir, model_name)
        local embeddings_file = storage_paths.embeddings
        -- gsub returns (string, count); the local assignment keeps only the string.
        local output_file = storage_paths.similarity_matrix:gsub("%.json$", "_full.json")

        utils.log_info("⚠️ FULL matrix generation will take 2-4 hours and create ~100MB file")
        local confirm = ask("Continue? (y/N): "):lower()
        if confirm == "y" or confirm == "yes" then
            M.calculate_full_similarity_matrix(embeddings_file, output_file, false)
        else
            utils.log_info("Full matrix generation cancelled")
        end
    elseif choice == "4" then
        local similarity_file = utils.asset_path(SIMILARITY_MATRIX_FILE)
        local poems_file = utils.asset_path("poems.json")
        local output_file = utils.asset_path("similarity-report.json")
        M.generate_similarity_report(similarity_file, poems_file, output_file)
    elseif choice == "5" then
        utils.log_info("Running complete similarity engine pipeline...")
        local poems_file = utils.asset_path("poems.json")
        local base_output_dir = utils.get_assets_root()
        local similarity_file = utils.asset_path(SIMILARITY_MATRIX_FILE)
        local report_file = utils.asset_path("similarity-report.json")

        if M.generate_all_embeddings(poems_file, base_output_dir) then
            local storage_paths = get_model_storage_path(base_output_dir, DEFAULT_MODEL)
            local embeddings_file = storage_paths.embeddings
            if M.calculate_similarity_matrix(embeddings_file, similarity_file) then
                M.generate_similarity_report(similarity_file, poems_file, report_file)
                utils.log_info("✅ Complete pipeline executed successfully!")
            else
                utils.log_error("Pipeline failed at similarity matrix calculation")
            end
        else
            utils.log_error("Pipeline failed at embedding generation")
        end
    elseif choice == "6" then
        local base_output_dir = utils.get_assets_root()
        local use_full_matrix = ask_full_matrix("Matrix type - (s)parse or (f)ull? (default: sparse): ")

        local completeness_input = ask("Minimum completeness percentage (default: 80): ")
        -- Convert the percentage to a 0..1 fraction; non-numeric input means 80%.
        local min_completeness = (tonumber(completeness_input) or 80) / 100

        M.generate_all_model_similarity_matrices(base_output_dir, min_completeness, use_full_matrix)
        utils.log_info("Multi-model generation complete. Results available in similarity engine.")
    elseif choice == "7" then
        local base_output_dir = utils.get_assets_root()
        M.get_multi_model_status(base_output_dir)
    elseif choice == "8" then
        local poem_id = tonumber(ask("Poem ID to compare: "))
        if not poem_id then
            -- compare_model_similarities concatenates poem_id into a log
            -- message, so a nil ID would raise there; reject it up front.
            utils.log_error("Invalid poem ID: expected a number")
            return
        end
        local base_output_dir = utils.get_assets_root()
        local use_full_matrix = ask_full_matrix("Use (s)parse or (f)ull matrices? (default: sparse): ")

        M.compare_model_similarities(poem_id, base_output_dir, {}, use_full_matrix)
        utils.log_info("Model comparison complete.")
    else
        print("Invalid choice")
    end
end
1793-- }}}
1794
1795-- {{{ function M.flush_embeddings_cache
-- Describe the size of the file at `path` in a short human-readable form
-- (for example "512B", "1.5K", "97.3M"); returns "unknown" when the file
-- cannot be opened or its size cannot be determined. Uses plain Lua file I/O
-- so no shell command is built from the path.
local function describe_cache_file_size(path)
    local handle = io.open(path, "rb")
    if not handle then
        return "unknown"
    end
    local bytes = handle:seek("end")
    handle:close()
    if not bytes then
        return "unknown"
    end

    local units = { "B", "K", "M", "G", "T" }
    local unit_index = 1
    while bytes >= 1024 and unit_index < #units do
        bytes = bytes / 1024
        unit_index = unit_index + 1
    end
    if unit_index == 1 then
        return string.format("%d%s", bytes, units[1])
    end
    return string.format("%.1f%s", bytes, units[unit_index])
end

-- Remove all or part of an embeddings cache file.
--
-- Parameters:
--   output_file  path of the cache file to flush
--   flush_type   "all" (delete the file) or "errors" (rewrite the file keeping
--                only entries whose embedding vector is non-empty and matches
--                the dimension declared in the file's metadata); defaults to
--                "all"
--   backup       pass false to skip the timestamped backup copy; any other
--                value (including nil) creates one before modifying the file
--
-- Returns true on success (including "nothing to do" cases such as a missing
-- cache file), false on failure or for an unknown flush_type.
function M.flush_embeddings_cache(output_file, flush_type, backup)
    flush_type = flush_type or "all"
    backup = backup ~= false -- Default to true

    if not utils.file_exists(output_file) then
        utils.log_info("No cache file found at: " .. output_file)
        return true
    end

    utils.log_info("Cache flush operation: " .. tostring(flush_type))
    utils.log_info("Target file: " .. output_file)
    utils.log_info("File size: " .. describe_cache_file_size(output_file))

    -- Reject unsupported modes before touching the disk so that a typo does
    -- not leave a stray backup file behind.
    if flush_type ~= "all" and flush_type ~= "errors" then
        utils.log_error("Unknown flush type: " .. tostring(flush_type))
        return false
    end

    if backup then
        local backup_file = output_file .. ".backup." .. os.date("%Y%m%d_%H%M%S")

        -- Copy with Lua file operations rather than a shell command.
        local source_file = io.open(output_file, "rb")
        if not source_file then
            utils.log_error("Failed to open source file for backup")
            return false
        end

        local content = source_file:read("*a")
        source_file:close()
        if not content then
            utils.log_error("Failed to read source file for backup")
            return false
        end

        local backup_dest = io.open(backup_file, "wb")
        if not backup_dest then
            utils.log_error("Failed to create backup file: " .. backup_file)
            return false
        end

        -- file:write returns nil plus a message on failure; do not go on to
        -- delete or rewrite the cache if the backup is incomplete.
        local written = backup_dest:write(content)
        backup_dest:close()
        if not written then
            utils.log_error("Failed to write backup file: " .. backup_file)
            return false
        end

        utils.log_info("Backup created: " .. backup_file)
    end

    if flush_type == "all" then
        -- Complete cache flush
        if os.remove(output_file) then
            utils.log_info("✅ Complete embedding cache flushed")
            return true
        end
        utils.log_error("Failed to remove cache file")
        return false
    end

    -- flush_type == "errors": drop invalid entries, keep valid embeddings.
    local existing_data = utils.read_json_file(output_file)
    if not existing_data or not existing_data.embeddings then
        utils.log_warn("No embeddings data found in cache file")
        return true
    end

    local clean_embeddings = {}
    local removed_count = 0
    local kept_count = 0

    -- "Valid" means "matches the cache file's declared dimension", read from
    -- the metadata block of this same file. When no dimension is declared,
    -- any non-empty vector is accepted.
    local expected_dim = existing_data.metadata and existing_data.metadata.embedding_dimension
    for key, emb in pairs(existing_data.embeddings) do
        local vector = type(emb) == "table" and emb.embedding or nil
        local is_valid = type(vector) == "table"
            and #vector > 0
            and (not expected_dim or #vector == expected_dim)
        if is_valid then
            -- Entries keep their original key.
            clean_embeddings[key] = emb
            kept_count = kept_count + 1
        else
            removed_count = removed_count + 1
        end
    end

    existing_data.embeddings = clean_embeddings

    -- Record the operation in the file's own metadata, when it has any.
    if existing_data.metadata then
        existing_data.metadata.completed_embeddings = kept_count
        existing_data.metadata.last_flush_operation = {
            type = "errors_only",
            timestamp = os.date("%Y-%m-%d %H:%M:%S"),
            removed_entries = removed_count,
            kept_entries = kept_count
        }
    end

    if utils.write_json_file(output_file, existing_data) then
        utils.log_info("✅ Error entries flushed: " .. removed_count .. " entries removed, " .. kept_count .. " kept")
        return true
    end

    utils.log_error("Failed to write cleaned cache file")
    return false
end
1907-- }}}
1908
-- Script entry point.
-- Issue 8-032: main() runs only when this file is executed as a script, i.e.
-- when arg[0] is set. When the module is required from `luajit -e`, `arg`
-- exists but arg[0] is nil, so nothing runs at load time.
if arg and arg[0] then
    -- Scan the positional arguments for the interactive flag.
    local wants_interactive = false
    local position = 1
    while arg[position] ~= nil do
        if arg[position] == "-I" then
            wants_interactive = true
            break
        end
        position = position + 1
    end

    M.main(wants_interactive)
end
1923
1924return M