src/html-generator/embedding-list-generator.lua
1#!/usr/bin/env lua
2
3-- Embedding-Based Similarity and Diversity List Generator
4-- Pre-generates similarity and diversity data for modular HTML generation
5
6package.path = package.path .. ';./?.lua;./libs/?.lua'
7
8local utils = require("libs.utils")
9-- Issue 10-051 family: shared progress renderer (animated bar on a TTY, plain
10-- lines under --debug, silent when piped) so these long loops show one updating
11-- line instead of scrolling a "[INFO] Progress:" line every N items.
12local progress = require("libs.progress-display")
13
14local M = {}
15local DIR = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
16
17-- {{{ function generate_most_similar_lists
--- Write one "most similar" JSON list per poem found in the similarity matrix.
--
-- Loads `<embeddings_dir>/<model_name>/similarity_matrix.json` and, for every
-- key under its `similarities` table, writes
-- `<model_dir>/similarity_lists/most_similar/poem-NNN-most-similar.json`
-- holding that poem's neighbours sorted by descending score and ranked from 1.
--
-- Each per-poem entry is read in one of two shapes:
--   * `{ top_similar = { { id|index = ..., similarity = ... }, ... } }`
--   * a flat `target_id -> score` map (the keys "poem_index" and
--     "calculated_at" are treated as metadata and ignored).
--
-- @param embeddings_dir string  directory containing one sub-directory per model
-- @param model_name string      model sub-directory; also copied into every output file
-- @return boolean  true when every list was written; false when the matrix
--                  could not be loaded/used or a file could not be written
function M.generate_most_similar_lists(embeddings_dir, model_name)
    local model_dir = embeddings_dir .. "/" .. model_name
    local similarity_matrix_file = model_dir .. "/similarity_matrix.json"

    utils.log_info(string.format("Loading similarity matrix from: %s", similarity_matrix_file))
    local similarity_data = utils.read_json_file(similarity_matrix_file)

    if not similarity_data then
        utils.log_error("Failed to load similarity matrix")
        return false
    end

    -- A file that parses but has no `similarities` table would otherwise make
    -- pairs() raise; report it as a load failure instead.
    if type(similarity_data) ~= "table" or type(similarity_data.similarities) ~= "table" then
        utils.log_error("Similarity matrix has no 'similarities' table")
        return false
    end
    local similarities_by_poem = similarity_data.similarities

    local output_dir = model_dir .. "/similarity_lists/most_similar"
    -- Single-quote the path for the shell so spaces and metacharacters in the
    -- directory names are taken literally; `--` guards against a leading dash.
    local quoted_dir = "'" .. output_dir:gsub("'", [['\'']]) .. "'"
    os.execute("mkdir -p -- " .. quoted_dir)

    local total_poems = 0
    for _ in pairs(similarities_by_poem) do
        total_poems = total_poems + 1
    end

    utils.log_info(string.format("Generating most similar lists for %d poems", total_poems))

    -- Throttle sparser under --debug (verbose); the mode is read once because
    -- it is not expected to change while the loop runs.
    local step = (progress.mode() == 2) and 100 or 25
    local processed_count = 0

    for poem_id, similarities in pairs(similarities_by_poem) do
        processed_count = processed_count + 1

        -- Also report the final item so the display ends on total/total.
        if processed_count % step == 0 or processed_count == total_poems then
            progress.update("   📋 Most-similar lists", processed_count, total_poems)
        end

        -- Anything that is not a table carries no neighbours.
        local entry = (type(similarities) == "table") and similarities or {}
        local similarity_list = {}

        if entry.top_similar then
            -- Array structure: one record per neighbour.
            for _, similarity_entry in ipairs(entry.top_similar) do
                similarity_list[#similarity_list + 1] = {
                    poem_id = similarity_entry.id or similarity_entry.index,
                    similarity_score = similarity_entry.similarity
                }
            end
        else
            -- Fallback: treat the entry as a direct poem_id -> score mapping.
            for target_id, score in pairs(entry) do
                -- Non-numeric values are metadata, not scores; mixing them with
                -- numbers would make the sort comparator below raise.
                if target_id ~= "poem_index" and target_id ~= "calculated_at"
                    and type(score) == "number" then
                    similarity_list[#similarity_list + 1] = {
                        poem_id = tonumber(target_id),
                        similarity_score = score
                    }
                end
            end
        end

        -- Highest similarity first; a missing score sorts as 0.
        table.sort(similarity_list, function(a, b)
            return (a.similarity_score or 0) > (b.similarity_score or 0)
        end)

        for i, item in ipairs(similarity_list) do
            item.rank = i
        end

        -- Keys that are not numeric fall back to poem id 0.
        local numeric_poem_id = tonumber(poem_id) or 0

        local output_data = {
            source_poem_id = numeric_poem_id,
            model_name = model_name,
            generation_timestamp = os.date("%Y-%m-%dT%H:%M:%S"),
            total_similar_poems = #similarity_list,
            most_similar_poems = similarity_list
        }

        local output_file = string.format("%s/poem-%03d-most-similar.json", output_dir, numeric_poem_id)
        local success = utils.write_json_file(output_file, output_data)

        if not success then
            -- Close the progress display before logging so the error does not
            -- land on a half-drawn progress line.
            -- NOTE(review): assumes progress.finish() is safe to call mid-run — confirm.
            progress.finish()
            utils.log_error(string.format("Failed to write most similar list for poem %s", poem_id))
            return false
        end
    end
    progress.finish()

    utils.log_info(string.format("Generated %d most similar lists", processed_count))
    return true
end
107-- }}}
108
109-- {{{ function generate_least_similar_chain
--- Build a greedy "diversity chain" starting from one poem.
--
-- Starting at `starting_poem_id`, each step asks M.find_least_similar_poem for
-- the not-yet-used poem least similar to the current one, appends it, and
-- continues from there. The chain ends early when no candidate is returned.
--
-- @param starting_poem_id  id of the first poem in the chain
-- @param similarity_data   table handed through to M.find_least_similar_poem
-- @param max_length        maximum number of chain entries (defaults to 20
--                          when nil or not a number)
-- @return table  array of entries with fields `poem_id`, `position`,
--                `similarity_to_previous` (nil for the first entry) and
--                `selection_reason`
function M.generate_least_similar_chain(starting_poem_id, similarity_data, max_length)
    -- A nil limit would make the numeric `for` below raise.
    max_length = tonumber(max_length) or 20

    local chain = {
        {
            poem_id = starting_poem_id,
            position = 1,
            similarity_to_previous = nil,
            selection_reason = "starting_poem"
        }
    }

    local used_poems = {}

    -- Record an id under its raw, numeric and string forms so the lookup in
    -- M.find_least_similar_poem matches whether ids arrive as 5 or "5".
    local function mark_used(id)
        if id == nil then
            return
        end
        used_poems[id] = true
        local as_number = tonumber(id)
        if as_number ~= nil then
            used_poems[as_number] = true
        end
        used_poems[tostring(id)] = true
    end

    -- Kept from the original behaviour: a non-numeric start id marks poem 0.
    used_poems[tonumber(starting_poem_id) or 0] = true
    mark_used(starting_poem_id)

    local current_poem_id = starting_poem_id

    for position = 2, max_length do
        local least_similar = M.find_least_similar_poem(current_poem_id, similarity_data, used_poems)

        -- Stop when nothing is left, or when the candidate carries no id: a nil
        -- id cannot be stored as a table key and cannot be followed next round.
        if not least_similar or least_similar.poem_id == nil then
            break
        end

        chain[#chain + 1] = {
            poem_id = least_similar.poem_id,
            position = position,
            similarity_to_previous = least_similar.similarity_score,
            selection_reason = "least_similar_to_previous"
        }

        mark_used(least_similar.poem_id)
        current_poem_id = least_similar.poem_id
    end

    return chain
end
143-- }}}
144
145-- {{{ function find_least_similar_poem
--- Find the unused poem with the lowest similarity to `current_poem_id`.
--
-- The entry for the current poem is looked up in `similarity_data.similarities`
-- by the string form of the id and is read in one of two shapes:
--   * `{ top_similar = { { id|index = ..., similarity = ... }, ... } }`
--   * a flat `target_id -> score` map (the keys "poem_index" and
--     "calculated_at" are treated as metadata and ignored).
-- Records without an id or without a numeric score are skipped. On equal
-- scores the first one visited wins; for the flat map that order is whatever
-- pairs() yields.
--
-- @param current_poem_id  id of the poem to compare against
-- @param similarity_data  table with a `similarities` sub-table
-- @param used_poems       set (id -> true) of poems to exclude; nil means none
-- @return table|nil  `{ poem_id = ..., similarity_score = ... }`, or nil when
--                    the poem has no entry or no eligible candidate remains
function M.find_least_similar_poem(current_poem_id, similarity_data, used_poems)
    local all_similarities = (type(similarity_data) == "table") and similarity_data.similarities or nil
    if type(all_similarities) ~= "table" then
        return nil
    end
    used_poems = used_poems or {}

    local current_similarities = all_similarities[tostring(current_poem_id)]
    if current_similarities == nil then
        -- On Lua 5.3+ a float id such as 5.0 stringifies as "5.0" and misses
        -- the "5" key; retry with the integer spelling.
        local numeric_id = tonumber(current_poem_id)
        if numeric_id and numeric_id == math.floor(numeric_id) and math.abs(numeric_id) < 2^53 then
            current_similarities = all_similarities[string.format("%d", numeric_id)]
        end
    end

    if type(current_similarities) ~= "table" then
        return nil
    end

    -- Treat 5 and "5" as the same poem when consulting the used-set.
    local function is_used(id)
        if used_poems[id] then
            return true
        end
        local as_number = tonumber(id)
        if as_number ~= nil and used_poems[as_number] then
            return true
        end
        return used_poems[tostring(id)] and true or false
    end

    local best_id = nil
    local lowest_similarity = math.huge

    -- Keep the candidate if it is well-formed, strictly lower, and unused.
    local function consider(target_id, similarity_score)
        if target_id == nil or type(similarity_score) ~= "number" then
            return
        end
        if similarity_score < lowest_similarity and not is_used(target_id) then
            best_id = target_id
            lowest_similarity = similarity_score
        end
    end

    local top_similar = current_similarities.top_similar
    if top_similar then
        -- Array structure with similarity entries.
        for _, similarity_entry in ipairs(top_similar) do
            if type(similarity_entry) == "table" then
                consider(similarity_entry.id or similarity_entry.index, similarity_entry.similarity)
            end
        end
    else
        -- Direct poem_id -> score mapping.
        for target_poem_id, similarity_score in pairs(current_similarities) do
            if target_poem_id ~= "poem_index" and target_poem_id ~= "calculated_at" then
                consider(tonumber(target_poem_id), similarity_score)
            end
        end
    end

    if best_id == nil then
        return nil
    end

    return {
        poem_id = best_id,
        similarity_score = lowest_similarity
    }
end
196-- }}}
197
198-- {{{ function generate_diversity_chain_lists
--- Write one diversity-chain JSON file per poem found in the similarity matrix.
--
-- Loads `<embeddings_dir>/<model_name>/similarity_matrix.json` and, for every
-- key under its `similarities` table, builds a chain with
-- M.generate_least_similar_chain and writes it to
-- `<model_dir>/similarity_lists/diversity_chains/poem-NNN-diversity-chain.json`.
--
-- @param embeddings_dir string  directory containing one sub-directory per model
-- @param model_name string      model sub-directory; also copied into every output file
-- @param chain_length number    target chain length (defaults to 20)
-- @return boolean  true when every chain was written; false when the matrix
--                  could not be loaded/used or a file could not be written
function M.generate_diversity_chain_lists(embeddings_dir, model_name, chain_length)
    chain_length = chain_length or 20

    local model_dir = embeddings_dir .. "/" .. model_name
    local similarity_matrix_file = model_dir .. "/similarity_matrix.json"

    utils.log_info(string.format("Loading similarity matrix from: %s", similarity_matrix_file))
    local similarity_data = utils.read_json_file(similarity_matrix_file)

    if not similarity_data then
        utils.log_error("Failed to load similarity data")
        return false
    end

    -- A file that parses but has no `similarities` table would otherwise make
    -- pairs() raise; report it as a load failure instead.
    if type(similarity_data) ~= "table" or type(similarity_data.similarities) ~= "table" then
        utils.log_error("Similarity data has no 'similarities' table")
        return false
    end
    local similarities_by_poem = similarity_data.similarities

    local output_dir = model_dir .. "/similarity_lists/diversity_chains"
    -- Single-quote the path for the shell so spaces and metacharacters in the
    -- directory names are taken literally; `--` guards against a leading dash.
    local quoted_dir = "'" .. output_dir:gsub("'", [['\'']]) .. "'"
    os.execute("mkdir -p -- " .. quoted_dir)

    local total_poems = 0
    for _ in pairs(similarities_by_poem) do
        total_poems = total_poems + 1
    end

    utils.log_info(string.format("Generating diversity chains for %d poems", total_poems))

    -- Throttle sparser under --debug (verbose); the mode is read once because
    -- it is not expected to change while the loop runs.
    local step = (progress.mode() == 2) and 100 or 25
    local processed_count = 0

    for starting_poem_id in pairs(similarities_by_poem) do
        processed_count = processed_count + 1

        -- Also report the final item so the display ends on total/total.
        if processed_count % step == 0 or processed_count == total_poems then
            progress.update("   🎲 Diversity chains", processed_count, total_poems)
        end

        local diversity_chain = M.generate_least_similar_chain(
            tonumber(starting_poem_id),
            similarity_data,
            chain_length
        )

        -- Keys that are not numeric fall back to poem id 0.
        local numeric_start_id = tonumber(starting_poem_id) or 0

        local output_data = {
            starting_poem_id = numeric_start_id,
            model_name = model_name,
            generation_timestamp = os.date("%Y-%m-%dT%H:%M:%S"),
            chain_length = #diversity_chain,
            target_chain_length = chain_length,
            diversity_chain = diversity_chain
        }

        local output_file = string.format("%s/poem-%03d-diversity-chain.json", output_dir, numeric_start_id)
        local success = utils.write_json_file(output_file, output_data)

        if not success then
            -- Close the progress display before logging so the error does not
            -- land on a half-drawn progress line.
            -- NOTE(review): assumes progress.finish() is safe to call mid-run — confirm.
            progress.finish()
            utils.log_error(string.format("Failed to write diversity chain for poem %s", starting_poem_id))
            return false
        end
    end
    progress.finish()

    utils.log_info(string.format("Generated %d diversity chain lists", processed_count))
    return true
end
261-- }}}
262
263-- {{{ function generate_all_embedding_lists
--- Run every embedding-list generator for one model, stopping at the first failure.
--
-- Runs M.generate_most_similar_lists, then M.generate_diversity_chain_lists.
--
-- @param embeddings_dir string  directory containing one sub-directory per model
-- @param model_name string      model sub-directory to process
-- @param options table|nil      optional settings; `chain_length` (default 20)
--                               is forwarded to the diversity-chain generator
-- @return boolean  true when both generators succeeded, false otherwise
function M.generate_all_embedding_lists(embeddings_dir, model_name, options)
    local settings = options or {}
    local chain_length = settings.chain_length or 20

    utils.log_info(string.format("Generating all embedding lists for model: %s", model_name))

    -- Stages run in order; each pairs the work with the message logged when it fails.
    local stages = {
        {
            failure_message = "Failed to generate most similar lists",
            run = function()
                return M.generate_most_similar_lists(embeddings_dir, model_name)
            end
        },
        {
            failure_message = "Failed to generate diversity chain lists",
            run = function()
                return M.generate_diversity_chain_lists(embeddings_dir, model_name, chain_length)
            end
        }
    }

    for _, stage in ipairs(stages) do
        local stage_ok = stage.run()
        if not stage_ok then
            utils.log_error(stage.failure_message)
            return false
        end
    end

    utils.log_info("Successfully generated all embedding lists")
    return true
end
287-- }}}
288
289return M