src/centroid-html-generator.lua

509 lines

1#!/usr/bin/env lua
2
3-- Centroid HTML Generator
4-- Generates exploration pages based on user-defined centroid embeddings.
5-- Each centroid gets a similarity page (poems most like that mood) and
6-- a diversity page (poems least like that mood).
7--
8-- This module extends the flat-html-generator with centroid-based navigation.
9
10-- {{{ local function setup_dir_path
-- Resolves the project root directory.
-- @param provided_dir optional override; any truthy value is returned as-is
-- @return provided_dir when given, otherwise the hard-coded project root
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
17-- }}}
18
19-- Script configuration
20local DIR = setup_dir_path()
21
22-- Load required libraries
23package.path = DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
24local utils = require("utils")
25local dkjson = require("dkjson")
26local inference_config = require("inference-server-config")
27
-- Helper: directory name (sanitized) for the currently selected model.
-- Every character other than alphanumerics, "-", "_" and "." is replaced
-- with "_". Serves as the default model_name in M.load_centroids and is
-- called directly by M.generate_all_centroid_pages. Was hardcoded to
-- "embeddinggemma_latest"; now follows the active model so a config swap
-- propagates automatically.
-- @return the sanitized model name (exactly one value)
local function current_model_dir()
    -- string.gsub returns (result, substitution_count). The outer parentheses
    -- truncate to the string so the count can never leak into a caller's
    -- argument list, e.g. some_fn(current_model_dir()).
    return (inference_config.get_selected_model():gsub("[^%w%-_.]", "_"))
end
35
36local M = {}
37
38-- {{{ local function cosine_similarity
-- Cosine similarity of two embedding vectors.
-- Returns 0 when either vector is missing, when their lengths differ, or
-- when either vector has zero magnitude.
-- @param vec1 first vector (array of numbers)
-- @param vec2 second vector (array of numbers)
-- @return dot(vec1, vec2) / (|vec1| * |vec2|), or 0 in the cases above
local function cosine_similarity(vec1, vec2)
    if not (vec1 and vec2) then
        return 0
    end

    local dims = #vec1
    if dims ~= #vec2 then
        return 0
    end

    local dot, sum_sq_a, sum_sq_b = 0, 0, 0
    for idx = 1, dims do
        local a, b = vec1[idx], vec2[idx]
        dot = dot + (a * b)
        sum_sq_a = sum_sq_a + (a * a)
        sum_sq_b = sum_sq_b + (b * b)
    end

    local magnitude_a = math.sqrt(sum_sq_a)
    local magnitude_b = math.sqrt(sum_sq_b)
    if magnitude_a == 0 or magnitude_b == 0 then
        return 0
    end

    return dot / (magnitude_a * magnitude_b)
end
64-- }}}
65
66-- {{{ function M.load_centroids
-- Loads generated centroid embeddings from the embeddings directory.
-- @param model_name optional sanitized model directory name; defaults to the
--        directory of the currently selected model
-- @return the decoded centroids document (a table holding a `centroids`
--         table), or nil when the file cannot be read, is not valid JSON,
--         or does not contain a `centroids` table
function M.load_centroids(model_name)
    model_name = model_name or current_model_dir()
    local centroids_file = utils.embeddings_dir(model_name) .. "/centroids.json"

    local content, err = utils.read_file(centroids_file)
    if not content then
        utils.log_warn("Could not load centroids: " .. (err or "file not found"))
        utils.log_warn("Run centroid-generator.lua first to create centroid embeddings")
        return nil
    end

    -- dkjson.decode returns (nil, position, message) on malformed input;
    -- keep the message so the log says why the file was rejected.
    local data, _, decode_err = dkjson.decode(content)

    -- Explicit type checks: a valid-but-unexpected document (a bare number,
    -- or a `centroids` field that is not an object) must be rejected here
    -- rather than raising later on index / pairs().
    if type(data) ~= "table" or type(data.centroids) ~= "table" then
        local detail = decode_err and (": " .. tostring(decode_err)) or ""
        utils.log_error("Invalid centroids file format" .. detail)
        return nil
    end

    local count = 0
    for _ in pairs(data.centroids) do
        count = count + 1
    end
    utils.log_info(string.format("Loaded %d centroid embeddings", count))

    return data
end
91-- }}}
92
93-- {{{ local function build_embeddings_lookup
-- Indexes array-format embeddings by poem ID.
-- Input shape handled: { embeddings = { {id = 1, embedding = {...}}, ... } }
-- @param embeddings_data decoded embeddings document (may be nil)
-- @return table mapping tostring(id) -> embedding; empty when the document
--         is missing or its `embeddings` field is absent or not a table.
--         Entries lacking an id or an embedding are skipped.
local function build_embeddings_lookup(embeddings_data)
    local by_id = {}

    local entries = embeddings_data and embeddings_data.embeddings
    if type(entries) ~= "table" then
        return by_id
    end

    for _, record in ipairs(entries) do
        local id, vector = record.id, record.embedding
        if id and vector then
            by_id[tostring(id)] = vector
        end
    end

    return by_id
end
113-- }}}
114
115-- {{{ function M.generate_centroid_similarity_ranking
-- Ranks all poems by their similarity to a centroid embedding.
-- Poems without an id are skipped. Poems with no usable embedding get a
-- similarity of 0. Poems with equal similarity keep their order from
-- poems_data.poems, so the ranking is reproducible.
-- @param centroid_data table with `embedding` (and optionally `name`)
-- @param poems_data table with a `poems` array; each poem needs an `id`
-- @param embeddings_data decoded embeddings document (array format)
-- @return array of { id, poem, similarity, rank } sorted most-similar first,
--         or nil when the centroid has no embedding or the poems list is
--         missing
function M.generate_centroid_similarity_ranking(centroid_data, poems_data, embeddings_data)
    local centroid_embedding = centroid_data.embedding

    if not centroid_embedding then
        utils.log_error("Centroid has no embedding: " .. (centroid_data.name or "unknown"))
        return nil
    end

    -- Fail the same way as the missing-embedding case instead of raising
    -- inside ipairs() when the poems document is absent or malformed.
    if type(poems_data) ~= "table" or type(poems_data.poems) ~= "table" then
        utils.log_error("Poems data has no poems list; cannot rank for centroid: "
            .. (centroid_data.name or "unknown"))
        return nil
    end

    -- Build lookup table from array-format embeddings
    local embeddings_lookup = build_embeddings_lookup(embeddings_data)

    local ranked_poems = {}
    -- entry -> position in input order; used only as the sort tie-breaker,
    -- kept out of the entries so the returned shape is unchanged.
    local input_order = {}

    -- Calculate similarity for each poem
    for _, poem in ipairs(poems_data.poems) do
        if poem.id then
            local poem_embedding = embeddings_lookup[tostring(poem.id)]

            local similarity = 0
            if type(poem_embedding) == "table" and #poem_embedding > 0 then
                similarity = cosine_similarity(centroid_embedding, poem_embedding)
            end

            local entry = {
                id = poem.id,
                poem = poem,
                similarity = similarity
            }
            ranked_poems[#ranked_poems + 1] = entry
            input_order[entry] = #ranked_poems
        end
    end

    -- Sort by similarity (descending = most similar first). table.sort is
    -- not stable, so ties are broken explicitly by input order.
    table.sort(ranked_poems, function(a, b)
        if a.similarity ~= b.similarity then
            return a.similarity > b.similarity
        end
        return input_order[a] < input_order[b]
    end)

    -- Add rank numbers
    for i, poem_info in ipairs(ranked_poems) do
        poem_info.rank = i
    end

    return ranked_poems
end
160-- }}}
161
162-- {{{ function M.generate_centroid_diversity_ranking
-- Ranks all poems by how much they differ from a centroid (least similar
-- first). Built by reversing the similarity ranking; each entry's `rank` is
-- rewritten for the new order and a `diversity` score (1 - similarity) is
-- attached.
-- @return array of ranking entries, or nil when the similarity ranking
--         could not be produced
function M.generate_centroid_diversity_ranking(centroid_data, poems_data, embeddings_data)
    local by_similarity = M.generate_centroid_similarity_ranking(centroid_data, poems_data, embeddings_data)
    if not by_similarity then
        return nil
    end

    local total = #by_similarity
    local by_diversity = {}

    -- Walk the similarity ranking from its tail so the least similar poem
    -- lands in position 1.
    for position = 1, total do
        local entry = by_similarity[total - position + 1]
        entry.rank = position
        entry.diversity = 1 - entry.similarity
        by_diversity[position] = entry
    end

    return by_diversity
end
187-- }}}
188
189-- {{{ local function generate_centroid_header
-- Creates the header section for centroid pages: the HTML preamble, a
-- banner with the centroid name, and the navigation links.
-- @param centroid_data table with `name`, optional `description`, and
--        `output_slug` (used to build the similar/different links)
-- @param page_type "similar" selects the similarity wording; any other
--        value selects the "different" wording
-- @param total_poems number of poems listed on the page
-- @return the header as a single string
local function generate_centroid_header(centroid_data, page_type, total_poems)
    local is_similar = page_type == "similar"

    -- Same fallback the ranking code uses when logging an unnamed centroid;
    -- without it string.upper(nil) raises.
    local name = centroid_data.name or "unknown"
    local description = centroid_data.description or ""

    local title = string.format("%s - %s exploration",
        name,
        is_similar and "Similar" or "Different")

    -- The "like this mood" suffix is the same for both page types, so it
    -- lives in the template rather than behind a conditional.
    return string.format([[<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>%s</title>
</head>
<body bgcolor="#000000" text="#FFFFFF" link="#6699FF" vlink="#9966FF">
<pre>
================================================================================
 %s
================================================================================

Theme: %s
Description: %s
Mode: %s poems like this mood
Total poems: %d

Navigation: <a href="../chronological.html">chronological</a> │ <a href="../explore.html">explore</a> │ <a href="../centroid/">all moods</a>
This page: <a href="%s-similar.html">similar</a> │ <a href="%s-different.html">different</a>

================================================================================

]],
        title,
        string.upper(name),
        name,
        description,
        is_similar and "most" or "least",
        total_poems,
        centroid_data.output_slug,
        centroid_data.output_slug
    )
end
235-- }}}
236
237-- {{{ local function generate_centroid_footer
-- Returns the closing banner and the tags that close the <pre>, <body>
-- and <html> elements.
local function generate_centroid_footer()
    local rule = "================================================================================"
    local lines = {
        rule,
        " END OF COLLECTION",
        rule,
        "</pre>",
        "</body>",
        "</html>",
        "", -- yields the trailing newline after </html>
    }
    return table.concat(lines, "\n")
end
248-- }}}
249
250-- {{{ local function get_unique_poem_filename_id
-- Builds the filename identifier for a poem as "<category>-<id>", with the
-- id zero-padded to four digits. Category defaults to "unknown" and id to 0.
-- See Issue 8-019 for rationale on cross-category ID collision prevention.
local function get_unique_poem_filename_id(poem)
    return ("%s-%04d"):format(poem.category or "unknown", poem.id or 0)
end
258-- }}}
259
260-- {{{ local function format_poem_for_centroid_page
-- Formats a single ranked poem as one entry of a centroid page: a rule, a
-- line with rank / poem id / score, links to the poem's own similar and
-- different pages, then the poem text.
-- @param poem_info ranking entry with `poem`, `rank`, and `similarity`
--        and/or `diversity`
-- @param page_type "similar" shows the similarity score; anything else
--        shows the diversity score
-- @return the formatted entry string
-- NOTE(review): poem text is inserted without HTML escaping - confirm this
-- matches what the other page generators do.
local function format_poem_for_centroid_page(poem_info, page_type)
    local poem = poem_info.poem

    -- Normalise line endings: CRLF first, then any remaining bare CR.
    local body = poem.content or poem.text or "[No content]"
    body = body:gsub("\r\n", "\n")
    body = body:gsub("\r", "\n")

    local label, value
    if page_type == "similar" then
        label = "similarity"
        -- A missing similarity falls back to the diversity value
        value = poem_info.similarity or poem_info.diversity
    else
        label = "diversity"
        value = poem_info.diversity
    end

    -- Category-prefixed id keeps filenames unique across categories
    local file_id = get_unique_poem_filename_id(poem)

    return string.format([[
--------------------------------------------------------------------------------
#%d │ Poem %s │ %s: %.4f
--------------------------------------------------------------------------------
<a href="../similar/%s.html">[similar]</a> <a href="../different/%s.html">[different]</a>

%s

]],
        poem_info.rank,
        poem.id,
        label,
        value or 0,
        file_id,
        file_id,
        body
    )
end
295-- }}}
296
297-- {{{ function M.generate_centroid_html_page
-- Assembles a complete HTML page for a centroid: header, one entry per
-- ranked poem (in the order given), then the footer.
-- @param centroid_data centroid table passed through to the header
-- @param ranked_poems array of ranking entries
-- @param page_type "similar" or "different"
-- @return the page as a single string
function M.generate_centroid_html_page(centroid_data, ranked_poems, page_type)
    local chunks = {}
    chunks[1] = generate_centroid_header(centroid_data, page_type, #ranked_poems)

    for _, ranked in ipairs(ranked_poems) do
        chunks[#chunks + 1] = format_poem_for_centroid_page(ranked, page_type)
    end

    chunks[#chunks + 1] = generate_centroid_footer()

    return table.concat(chunks)
end
315-- }}}
316
317-- {{{ function M.generate_centroid_index_page
-- Generates the index page listing every available mood centroid, with a
-- link to each centroid's similar and different pages.
-- @param centroids_data table whose `centroids` field maps slug -> centroid
--        (each centroid needs `name`; `description` is optional)
-- @return the page as a single string
function M.generate_centroid_index_page(centroids_data)
    local intro = [[<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Mood Exploration - Centroids</title>
</head>
<body bgcolor="#000000" text="#FFFFFF" link="#6699FF" vlink="#9966FF">
<pre>
================================================================================
 MOOD-BASED EXPLORATION
================================================================================

Browse poems by emotional tone or thematic cluster. Each mood has two views:
- Similar: poems that resonate with this mood
- Different: poems that contrast with this mood

Navigation: <a href="../chronological.html">chronological</a> │ <a href="../explore.html">explore</a>

================================================================================
 AVAILABLE MOODS
================================================================================

]]

    local entry_template = [[
--- %s ---
%s

 <a href="%s-similar.html">[similar poems]</a> <a href="%s-different.html">[different poems]</a>

]]

    local outro = [[
================================================================================
 ABOUT MOOD CENTROIDS
================================================================================

These mood pages are generated from custom "centroid" embeddings - semantic
fingerprints created by combining keywords and source texts that represent
each mood. When you visit a mood's "similar" page, you see all poems ranked
by how closely they match that emotional signature.

To add new moods, edit: assets/centroids.json
Then run: lua src/centroid-generator.lua

================================================================================
</pre>
</body>
</html>
]]

    -- pairs() order is unspecified, so list the moods sorted by slug to keep
    -- the page layout consistent between runs.
    local slugs = {}
    for slug in pairs(centroids_data.centroids) do
        slugs[#slugs + 1] = slug
    end
    table.sort(slugs)

    local chunks = { intro }
    for _, slug in ipairs(slugs) do
        local mood = centroids_data.centroids[slug]
        chunks[#chunks + 1] = entry_template:format(
            string.upper(mood.name),
            mood.description or "",
            slug,
            slug
        )
    end
    chunks[#chunks + 1] = outro

    return table.concat(chunks)
end
392-- }}}
393
394-- {{{ function M.generate_all_centroid_pages
-- Quotes a value so the shell treats it as exactly one literal argument.
local function shell_quote(value)
    return "'" .. tostring(value):gsub("'", "'\\''") .. "'"
end

-- Writes one generated page. On success records the path in `created` and
-- logs it; on failure logs the path that could not be written.
local function write_centroid_page(path, html, label, created)
    if utils.write_file(path, html) then
        created[#created + 1] = path
        utils.log_info(string.format(" Created: %s", label))
        return true
    end
    utils.log_error(string.format(" Failed to write: %s", path))
    return false
end

-- Main function to generate all centroid-based HTML pages.
-- For every centroid of the currently selected model it writes
-- "<slug>-similar.html" and "<slug>-different.html" under
-- "<output_dir>/centroid", then writes "index.html" there.
-- @param poems_data decoded poems document (table with `poems`)
-- @param embeddings_data decoded embeddings document
-- @param output_dir directory under which "centroid/" is created
-- @return { similar_pages = {paths}, different_pages = {paths},
--           index_page = path or nil }, or nil when no centroids could be
--         loaded
function M.generate_all_centroid_pages(poems_data, embeddings_data, output_dir)
    local model_name = current_model_dir()

    -- Load centroids
    local centroids_data = M.load_centroids(model_name)
    if not centroids_data then
        utils.log_warn("No centroids available - skipping centroid page generation")
        return nil
    end

    -- Create output directory. The path is quoted so spaces or shell
    -- metacharacters in output_dir cannot split or alter the command.
    local centroid_dir = output_dir .. "/centroid"
    local mkdir_status = os.execute("mkdir -p " .. shell_quote(centroid_dir))
    -- os.execute returns the exit status (0 = ok) on Lua 5.1 and `true` on
    -- success from Lua 5.2 onward; accept either.
    if mkdir_status ~= true and mkdir_status ~= 0 then
        utils.log_warn("Could not create output directory: " .. centroid_dir)
    end

    local results = {
        similar_pages = {},
        different_pages = {},
        index_page = nil
    }

    -- Visit centroids in sorted slug order (the order the index page uses)
    -- so logs and result lists are reproducible; pairs() order is not.
    local slugs = {}
    for slug in pairs(centroids_data.centroids) do
        slugs[#slugs + 1] = slug
    end
    table.sort(slugs)

    -- Generate pages for each centroid
    for _, slug in ipairs(slugs) do
        local centroid_data = centroids_data.centroids[slug]
        utils.log_info(string.format("Generating centroid pages: %s", centroid_data.name))

        -- Generate similarity ranking
        local similar_ranking = M.generate_centroid_similarity_ranking(
            centroid_data, poems_data, embeddings_data)

        if similar_ranking then
            -- Generate similar page
            local similar_html = M.generate_centroid_html_page(centroid_data, similar_ranking, "similar")
            write_centroid_page(
                centroid_dir .. "/" .. slug .. "-similar.html",
                similar_html,
                slug .. "-similar.html",
                results.similar_pages)

            -- Generate different page (reverse order)
            local different_ranking = M.generate_centroid_diversity_ranking(
                centroid_data, poems_data, embeddings_data)

            if different_ranking then
                local different_html = M.generate_centroid_html_page(centroid_data, different_ranking, "different")
                write_centroid_page(
                    centroid_dir .. "/" .. slug .. "-different.html",
                    different_html,
                    slug .. "-different.html",
                    results.different_pages)
            end
        else
            utils.log_warn(string.format(" Could not generate ranking for: %s", centroid_data.name))
        end
    end

    -- Generate index page
    local index_html = M.generate_centroid_index_page(centroids_data)
    local index_file = centroid_dir .. "/index.html"

    if utils.write_file(index_file, index_html) then
        results.index_page = index_file
        utils.log_info("Created: centroid/index.html")
    else
        utils.log_error("Failed to write: " .. index_file)
    end

    utils.log_info(string.format("Centroid generation complete: %d similar, %d different pages",
        #results.similar_pages, #results.different_pages))

    return results
end
466-- }}}
467
468-- {{{ Main execution
-- Runs only when this file is executed directly as a script (arg[0] ends
-- in "centroid-html-generator.lua"); skipped when the file is require()d.
-- Loads poems.json and embeddings.json, generates all centroid pages under
-- DIR/output, and exits with status 1 on any load, parse or generation
-- failure.
if arg and arg[0] and arg[0]:match("centroid%-html%-generator%.lua$") then
    utils.log_info("=== Centroid HTML Generator ===")
    utils.log_info("Generating mood-based exploration pages")
    utils.log_info("")

    -- Load poems data
    local poems_file = utils.get_assets_root() .. "/poems.json"
    local poems_content = utils.read_file(poems_file)
    if not poems_content then
        utils.log_error("Could not load poems.json")
        os.exit(1)
    end
    -- dkjson.decode returns (nil, position, message) on malformed input;
    -- stop here with that message rather than failing later on a nil index.
    local poems_data, _, poems_err = dkjson.decode(poems_content)
    if type(poems_data) ~= "table" then
        utils.log_error("Could not parse poems.json: " .. tostring(poems_err or "unexpected format"))
        os.exit(1)
    end

    -- Load embeddings data
    local embeddings_file = utils.embeddings_dir() .. "/embeddings.json"
    local embeddings_content = utils.read_file(embeddings_file)
    if not embeddings_content then
        utils.log_error("Could not load embeddings.json")
        os.exit(1)
    end
    local embeddings_data, _, embeddings_err = dkjson.decode(embeddings_content)
    if type(embeddings_data) ~= "table" then
        utils.log_error("Could not parse embeddings.json: " .. tostring(embeddings_err or "unexpected format"))
        os.exit(1)
    end

    -- Generate pages
    local output_dir = DIR .. "/output"
    local results = M.generate_all_centroid_pages(poems_data, embeddings_data, output_dir)

    if results then
        utils.log_info("\n=== Summary ===")
        utils.log_info(string.format("Similar pages: %d", #results.similar_pages))
        utils.log_info(string.format("Different pages: %d", #results.different_pages))
        utils.log_info(string.format("Index page: %s", results.index_page and "created" or "failed"))
    else
        utils.log_error("Centroid page generation failed")
        os.exit(1)
    end
end
506-- }}}
507
508return M
509