src/generate-word-pages.lua

1172 lines

1#!/usr/bin/env luajit
2
3-- {{{ generate-word-pages.lua
4-- Issue 8-043: Generate similarity pages for word cloud words
5-- Issue 8-043b: Separated into two stages for proper pipeline integration
6--
7-- For each word in the word cloud, generates a page showing poems ranked by
8-- their semantic similarity to that word's embedding.
9--
10-- Modes:
11-- --embeddings-only Stage 6: Generate word embeddings (expensive, via the inference server)
12-- --html-only Stage 9: Generate HTML pages (fast, uses cached embeddings)
13-- (no flag) Both stages (backward compatible)
14--
15-- Word Count Options:
16-- --all Include all words (no max_words limit)
17-- --words N Set maximum words to process (default: 200 from config)
18--
19-- Usage:
20-- luajit src/generate-word-pages.lua [DIR] [--embeddings-only|--html-only] [--all|--words N]
21-- luajit src/generate-word-pages.lua --help
22-- }}}
23
24-- {{{ Setup
-- Resolve the project root directory.
-- An explicitly supplied directory wins; otherwise the hard-coded default
-- checkout location is used.
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
31
-- {{{ parse_args
-- Parse the command line into DIR, the run mode and the word/page-size options.
--
-- Recognised arguments:
--   --embeddings-only / --html-only / --help / -h    run mode (default "both")
--   --all                                            no word limit
--   --words N, --words=N, --words all                word limit ("all" == --all)
--   --poems-per-page N, --poems-per-page=N           Issue 8-050d
--   --chrono-per-page N, --chrono-per-page=N         Issue 10-036: threaded from
--       run.sh so word-page "chronological" links paginate like the real
--       chronological pages (built by a separate process)
--   anything not starting with "-"                   project DIR (last one wins)
-- Unknown flags are skipped. A value that is missing or not numeric leaves the
-- option nil, which means "fall back to config" (never to a literal).
--
-- @param args  argument array (e.g. the global `arg`); nil is treated as empty
-- @return dir, mode, all_words, max_words, poems_per_page, chrono_per_page
local function parse_args(args)
    args = args or {}

    local dir = nil
    local mode = "both"
    local all_words = false
    local max_words = nil
    local poems_per_page = nil
    local chrono_per_page = nil

    -- Flags that only switch the run mode.
    local mode_flags = {
        ["--embeddings-only"] = "embeddings",
        ["--html-only"] = "html",
        ["--help"] = "help",
        ["-h"] = "help",
    }

    -- Flags that carry a value, either as the next argument or as --flag=value.
    local value_flags = {
        ["--words"] = function(value)
            -- "all" is a synonym for --all.
            if value == "all" then
                all_words = true
            else
                max_words = tonumber(value)
            end
        end,
        ["--poems-per-page"] = function(value)
            poems_per_page = tonumber(value)
        end,
        ["--chrono-per-page"] = function(value)
            chrono_per_page = tonumber(value)
        end,
    }

    local i = 1
    while i <= #args do
        local a = args[i]
        local consumed = 1

        if mode_flags[a] then
            mode = mode_flags[a]
        elseif a == "--all" then
            all_words = true
        elseif value_flags[a] then
            value_flags[a](args[i + 1])
            consumed = 2
        else
            -- The hyphens MUST be escaped: in a Lua pattern an unescaped "--"
            -- means "zero or more '-' (lazy)", which made "words=5" and
            -- "-words=5" parse as --words=5 and swallowed a positional DIR of
            -- that shape.
            local name, value = a:match("^(%-%-[%a%-]+)=(.*)$")
            if name and value_flags[name] then
                value_flags[name](value)
            elseif a:sub(1, 1) ~= "-" then
                dir = a
            end
            -- Anything else is an unknown flag and is skipped.
        end

        i = i + consumed
    end

    return dir, mode, all_words, max_words, poems_per_page, chrono_per_page
end
-- }}}
94
95local parsed_dir, RUN_MODE, CLI_ALL_WORDS, CLI_MAX_WORDS, CLI_POEMS_PER_PAGE, CLI_CHRONO_PER_PAGE = parse_args(arg)
96local DIR = setup_dir_path(parsed_dir)
97package.path = DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
98
99local dkjson = require("dkjson")
100local utils = require("utils")
101local inference_config = require("inference-server-config")
102-- Issue 10-050: shared batched embedding primitive (endpoint + prompt formatter
103-- threaded in so fuzzy-computing's separate config instance is never consulted).
104local fuzzy = require("fuzzy-computing")
105
106-- Issue 10-003: Load unified config from config.lua
107local config_loader = require("config-loader")
108config_loader.set_project_root(DIR)
109local unified_config = config_loader.load()
110-- Shared box/bar drawing so word-cloud poem pages can't drift from the
111-- similar/different + chronological pages (they had a third, divergent copy).
112local poem_bars = require("poem-bars")
113-- Shared chronological mapping (sort order + page numbers + timeline progress).
114-- Reused here so the word-page "chronological" links resolve to the SAME page
115-- and anchor the chronological pages actually emit (Issue 10-049 follow-up).
116local flat_html = require("flat-html-generator")
117
118utils.init_assets_root(arg)
119-- }}}
120
121local M = {}
122
123-- {{{ Configuration
-- Word limit, by precedence: --all (unlimited) > --words N > config > 200.
local wc = unified_config.word_cloud or {}
local effective_max_words = CLI_ALL_WORDS and math.huge
    or CLI_MAX_WORDS
    or wc.max_words
    or 200

-- Issue 8-050d: poems shown per word page, by precedence: CLI > config > 50.
local effective_poems_per_page = CLI_POEMS_PER_PAGE
if not effective_poems_per_page then
    effective_poems_per_page = wc.poems_per_page or 50
end
137
-- {{{ resolve_chrono_per_page()
-- Page size used for the per-poem "chronological" links (Issue 10-036).
-- The build's --chrono-per-page wins when it was given; otherwise the value
-- comes from flat_html.default_chrono_per_page(). There is deliberately no
-- literal fallback here: per the original note, a wrong size sends every link
-- to the wrong page, so an absent value must be an error rather than a guess.
local function resolve_chrono_per_page()
    if CLI_CHRONO_PER_PAGE then
        return CLI_CHRONO_PER_PAGE
    end
    -- Parenthesised so exactly one value is returned.
    return (flat_html.default_chrono_per_page())
end
-- }}}
147
-- The embedding model is resolved through inference-server-config
-- (config.lua / --server / --model) instead of being hardcoded, so a model
-- swap reaches this script without anyone editing a string here.
inference_config.set_project_root(DIR)

local CONFIG = {
    model_name = inference_config.get_selected_model(),
    -- Poems per word page.
    max_poems_per_page = 100,
    -- For now, just one page per word.
    max_pages_per_word = 1,
    word_embeddings_file = "word_embeddings.json",
    -- Issue 8-050d: configurable via CLI/config.
    poems_per_word_page = effective_poems_per_page,
    -- Max words to process (from CLI or config).
    max_words = effective_max_words,
}
160-- }}}
161
-- {{{ local function cosine_similarity
-- Cosine similarity of two numeric arrays.
-- Returns 0 when either vector is missing, when the lengths differ, or when
-- either vector has zero magnitude.
local function cosine_similarity(vec1, vec2)
    if not (vec1 and vec2) then
        return 0
    end
    local n = #vec1
    if n ~= #vec2 then
        return 0
    end

    local dot, sq1, sq2 = 0, 0, 0
    for k = 1, n do
        local x, y = vec1[k], vec2[k]
        dot = dot + (x * y)
        sq1 = sq1 + (x * x)
        sq2 = sq2 + (y * y)
    end

    local len1 = math.sqrt(sq1)
    local len2 = math.sqrt(sq2)
    if len1 == 0 or len2 == 0 then
        return 0
    end

    return dot / (len1 * len2)
end
-- }}}
188
-- {{{ local function load_word_embeddings_cache
-- Read the cached word embeddings from the embeddings directory.
-- Returns the file's `embeddings` field, or an empty table when the file could
-- not be read or has no such field.
local function load_word_embeddings_cache()
    local path = utils.embeddings_dir() .. "/" .. CONFIG.word_embeddings_file
    local cached = utils.read_json_file(path)
    if cached and cached.embeddings then
        return cached.embeddings
    end
    return {}
end
-- }}}
196
-- {{{ local function save_word_embeddings_cache
-- Write the word embeddings to the cache file together with the model name,
-- a generation timestamp and the number of entries.
-- Returns whatever utils.write_json_file returns.
local function save_word_embeddings_cache(embeddings)
    local path = utils.embeddings_dir() .. "/" .. CONFIG.word_embeddings_file
    local stamp = os.date("%Y-%m-%d %H:%M:%S")

    -- `embeddings` is keyed by word, so the entries have to be counted by hand.
    local total = 0
    for _ in pairs(embeddings) do
        total = total + 1
    end

    return utils.write_json_file(path, {
        embeddings = embeddings,
        model = CONFIG.model_name,
        generated = stamp,
        count = total,
    })
end
-- }}}
211
-- {{{ local function load_color_embeddings
-- Issue 8-050a: read the palette color embeddings used for semantic color
-- assignment. Returns the file's `embeddings` field, or nil when the file is
-- unreadable or the field is absent.
local function load_color_embeddings()
    local path = utils.embeddings_dir() .. "/color_embeddings.json"
    local loaded = utils.read_json_file(path)
    if not loaded then
        return nil
    end
    return loaded.embeddings or nil
end
-- }}}
220
-- {{{ local function compute_color_ranking
-- Issue 8-050a: rank EVERY palette color for a word by cosine similarity,
-- strongest first. ranking[1] is the word's semantic color (the old "nearest
-- color"). The whole ranking is kept, not just the winner, so the word cloud
-- can pick each large word's strongest NON-gray color (gray is reserved for
-- the de-emphasised small words) without recomputing embeddings.
--
-- @param word_embedding    numeric array, or nil
-- @param color_embeddings  table: color name -> numeric array, or nil
-- @return array of { color = name, similarity = sim }, sorted descending;
--         empty when either argument is missing
local function compute_color_ranking(word_embedding, color_embeddings)
    if not word_embedding or not color_embeddings then
        return {}
    end

    local ranking = {}
    for color_name, color_embedding in pairs(color_embeddings) do
        ranking[#ranking + 1] = {
            color = color_name,
            similarity = cosine_similarity(word_embedding, color_embedding),
        }
    end

    -- Ties are broken by color name. The array is filled in pairs() order,
    -- which differs from process to process, and table.sort is not stable; a
    -- similarity-only comparison would let equal scores (for example every
    -- color scoring 0 on a dimension mismatch) pick a different winner on
    -- every run.
    table.sort(ranking, function(a, b)
        if a.similarity ~= b.similarity then
            return a.similarity > b.similarity
        end
        return tostring(a.color) < tostring(b.color)
    end)
    return ranking
end
-- }}}
245
-- {{{ local function load_word_colors_cache
-- Issue 8-050a: read the cached word colors and index them by word.
-- The file stores an array under `word_colors`; the result maps each entry's
-- `word` to the entry itself. Returns an empty table when the file is
-- unreadable or has no `word_colors` field.
local function load_word_colors_cache()
    local path = utils.embeddings_dir() .. "/word_colors.json"
    local cached = utils.read_json_file(path)

    local by_word = {}
    if not (cached and cached.word_colors) then
        return by_word
    end

    for _, record in ipairs(cached.word_colors) do
        by_word[record.word] = record
    end
    return by_word
end
-- }}}
262
-- {{{ local function save_word_colors_cache
-- Issue 8-050a: write the word-color array to word_colors.json, alongside the
-- model name, a generation timestamp and the entry count.
-- Returns whatever utils.write_json_file returns.
local function save_word_colors_cache(word_colors_array)
    local path = utils.embeddings_dir() .. "/word_colors.json"
    local payload = {}
    payload.word_colors = word_colors_array
    payload.model = CONFIG.model_name
    payload.generated = os.date("%Y-%m-%d %H:%M:%S")
    payload.count = #word_colors_array
    return utils.write_json_file(path, payload)
end
-- }}}
276
-- {{{ local function compute_word_colors
-- Issue 8-050a: compute the semantic color of every word embedding.
--
-- Missing color embeddings are a hard error rather than a silent skip (the
-- author's call): they are produced by the semantic-color stage and must exist
-- by now. Skipping shipped colorless words while hiding a real upstream
-- problem, such as the cache being written to a path this reader does not look
-- at (the CACHE_IN_RAM desync, Issue 10-054).
--
-- @param word_embeddings  table: word -> embedding
-- @return array of { word, color, similarity, colors }, sorted by word, where
--         `colors` is the full ranking and `color`/`similarity` are its winner
local function compute_word_colors(word_embeddings)
    local color_embeddings = load_color_embeddings()
    if not color_embeddings then
        local missing_path = utils.embeddings_dir() .. "/color_embeddings.json"
        error("word color computation needs color embeddings, but "
            .. missing_path .. " was not found. "
            .. "The semantic-color stage must run before this AND must write where "
            .. "this reads. (Poem coloring regenerates them when absent; word "
            .. "coloring could be unified to do the same instead of erroring.)")
    end

    local word_colors = {}
    for word, embedding in pairs(word_embeddings) do
        local ranking = compute_color_ranking(embedding, color_embeddings)
        -- ranking[1] wins, and gray is a valid winner here: word pages and
        -- other consumers keep reading `color`. The full ranking rides along
        -- in `colors` so the word cloud can choose a non-gray color for large
        -- words.
        local winner = ranking[1] or { color = "gray", similarity = 0 }
        word_colors[#word_colors + 1] = {
            word = word,
            color = winner.color,
            similarity = winner.similarity,
            colors = ranking,
        }
    end

    -- Sorted by word so the output is the same from run to run.
    table.sort(word_colors, function(left, right)
        return left.word < right.word
    end)

    utils.log_info(string.format("Computed semantic colors for %d words", #word_colors))
    return word_colors
end
-- }}}
318
-- {{{ local function balanced_color_select
-- Issue 8-050b: select up to N poems with a cumulative-similarity-balanced
-- round-robin, so colors are represented roughly equally while the most
-- word-relevant poems of each color still come first. Cumulative totals keep
-- high-affinity colors from dominating.
--
-- Side effect: every candidate gains `color_sims` (color -> similarity),
-- `best_color` and `best_color_sim`.
--
-- @param candidates        array of { embedding = ..., word_similarity = ... }
-- @param color_embeddings  table: color name -> embedding
-- @param color_names       array of color names (iteration / priority order)
-- @param N                 maximum number of poems to select
-- @return array of selected candidates, in pick order
local function balanced_color_select(candidates, color_embeddings, color_names, N)
    -- Phase 2: compute each candidate's affinity to every known color.
    for _, candidate in ipairs(candidates) do
        local best_color = "gray"
        local best_color_sim = -1
        candidate.color_sims = {}
        for _, color_name in ipairs(color_names) do
            local color_emb = color_embeddings[color_name]
            if color_emb then
                local sim = cosine_similarity(color_emb, candidate.embedding)
                candidate.color_sims[color_name] = sim
                if sim > best_color_sim then
                    best_color_sim = sim
                    best_color = color_name
                end
            end
        end
        candidate.best_color = best_color
        candidate.best_color_sim = best_color_sim
    end

    -- Phase 3: one bucket per color, each sorted by word_similarity descending.
    -- `order` is the color iteration order for the rest of the function.
    local order = {}
    local buckets = {}
    for i, color_name in ipairs(color_names) do
        order[i] = color_name
        buckets[color_name] = {}
    end
    for _, candidate in ipairs(candidates) do
        local bucket = buckets[candidate.best_color]
        if not bucket then
            -- A candidate that matched no color keeps the "gray" fallback.
            -- When "gray" is not in color_names there was no bucket for it and
            -- the insert below crashed on nil; create the bucket on demand so
            -- such candidates remain selectable.
            bucket = {}
            buckets[candidate.best_color] = bucket
            order[#order + 1] = candidate.best_color
        end
        bucket[#bucket + 1] = candidate
    end
    for _, color_name in ipairs(order) do
        table.sort(buckets[color_name], function(a, b)
            return a.word_similarity > b.word_similarity
        end)
    end

    -- Phase 4: balanced round-robin. The color with the lowest cumulative
    -- color-similarity total picks next.
    local cumulative = {}
    local bucket_idx = {} -- next pick index per color
    for _, color_name in ipairs(order) do
        cumulative[color_name] = 0
        bucket_idx[color_name] = 1
    end

    local selected = {}
    while #selected < N do
        -- Find the color with the lowest cumulative score that still has
        -- candidates left.
        local pick_color = nil
        local lowest_cum = math.huge
        local most_remaining = -1
        for _, color_name in ipairs(order) do
            local remaining = #buckets[color_name] - bucket_idx[color_name] + 1
            if remaining > 0 then
                local cum = cumulative[color_name]
                -- Tiebreak: prefer the color with more remaining candidates.
                if cum < lowest_cum or (cum == lowest_cum and remaining > most_remaining) then
                    lowest_cum = cum
                    pick_color = color_name
                    most_remaining = remaining
                end
            end
        end

        if not pick_color then break end -- all buckets exhausted

        -- Pop the top candidate from this color's bucket.
        local idx = bucket_idx[pick_color]
        local poem = buckets[pick_color][idx]
        bucket_idx[pick_color] = idx + 1

        -- High-affinity colors "spend" their budget faster.
        cumulative[pick_color] = cumulative[pick_color] + poem.best_color_sim

        selected[#selected + 1] = poem
    end

    return selected
end
-- }}}
402
-- {{{ local function compute_centroid
-- Issue 8-050e: average the embeddings of the selected poems.
-- Each entry is looked up in poem_lookup by tostring(entry.poem.poem_index);
-- entries with no embedding are ignored. The dimension is taken from the first
-- embedding found.
-- Returns the centroid array, or nil when `poems` is missing/empty or none of
-- its entries has an embedding.
local function compute_centroid(poems, poem_lookup)
    if not poems or #poems == 0 then
        return nil
    end

    local dim, sums
    local used = 0
    for _, entry in ipairs(poems) do
        local vec = poem_lookup[tostring(entry.poem and entry.poem.poem_index)]
        if vec then
            if not dim then
                -- First embedding found: fix the dimension and start from zeros.
                dim = #vec
                sums = {}
                for d = 1, dim do
                    sums[d] = 0
                end
            end
            for d = 1, dim do
                sums[d] = sums[d] + vec[d]
            end
            used = used + 1
        end
    end

    if not dim then
        return nil
    end

    for d = 1, dim do
        sums[d] = sums[d] / used
    end
    return sums
end
-- }}}
444
-- {{{ local function find_closest_poem_to_centroid
-- Issue 8-050e: find the poem whose embedding is closest (highest cosine
-- similarity) to the given centroid.
--
-- Only embeddings that have a poem record in poems_by_index compete. Before,
-- an embedding without a record could win: it raised the best similarity while
-- setting the result to nil, so the function returned nil even though a poem
-- with data was available.
-- Exact ties go to the lower poem index, so the result does not depend on
-- pairs() order, which differs from process to process.
--
-- @param centroid        numeric array, or nil
-- @param poem_lookup     table: poem id (string) -> embedding
-- @param poems_by_index  table: poem index (number) -> poem data
-- @return poem data, or nil when there is no centroid or no poem qualifies
local function find_closest_poem_to_centroid(centroid, poem_lookup, poems_by_index)
    if not centroid then return nil end

    local best_poem = nil
    local best_similarity = -1
    local best_index = nil

    for poem_id_str, poem_embedding in pairs(poem_lookup) do
        local index = tonumber(poem_id_str)
        local poem = index and poems_by_index[index]
        if poem then
            local sim = cosine_similarity(centroid, poem_embedding)
            local wins_tie = sim == best_similarity and best_index ~= nil and index < best_index
            if sim > best_similarity or wins_tie then
                best_similarity = sim
                best_poem = poem
                best_index = index
            end
        end
    end

    return best_poem
end
-- }}}
465
-- {{{ local function get_word_list
-- Extract the word list from the poems (same logic as wordcloud-generator).
--
-- Words are runs of alphanumerics, lower-cased. A word is counted when it is
-- at least min_word_length long, is not a stop word and is not purely digits.
-- Words seen fewer than min_occurrences times are dropped; the rest are
-- ordered by count and cut to max_words.
--
-- @return array of words, most frequent first
local function get_word_list(poems_data, stop_words, min_occurrences, max_words, min_word_length)
    local tally = {}
    for _, poem in ipairs(poems_data.poems or {}) do
        local text = poem.content or ""
        for token in text:gmatch("[%w]+") do
            local key = token:lower()
            local countable = #key >= min_word_length
                and not stop_words[key]
                and not key:match("^%d+$")
            if countable then
                tally[key] = (tally[key] or 0) + 1
            end
        end
    end

    -- Keep only words that occur often enough.
    local ranked = {}
    for word, count in pairs(tally) do
        if count >= min_occurrences then
            ranked[#ranked + 1] = { word = word, count = count }
        end
    end

    -- Count descending, then ALPHABETICAL. The tiebreak is load-bearing, not
    -- cosmetic: `ranked` is filled by iterating a hash with pairs(), whose
    -- order differs from process to process, and table.sort is not stable.
    -- Without it, two separate runs (stage 6 generating embeddings, the HTML
    -- stage consuming them) order ties differently, the max_words cutoff keeps
    -- DIFFERENT words in each, and a word reaches the HTML list without an
    -- embedding ("Missing embedding for word X").
    table.sort(ranked, function(left, right)
        if left.count == right.count then
            return left.word < right.word
        end
        return left.count > right.count
    end)

    -- Cut to max_words.
    local words = {}
    local limit = math.min(#ranked, max_words)
    for i = 1, limit do
        words[i] = ranked[i].word
    end
    return words
end
-- }}}
513
-- {{{ local function load_stop_words
-- Issue 10-003: build the stop-word set from the word_cloud.stop_words array
-- of the unified config. Returns a table mapping each lower-cased stop word
-- to true; empty when the config has no such array.
local function load_stop_words()
    local section = unified_config.word_cloud or {}
    local listed = section.stop_words or {}

    local set = {}
    for k = 1, #listed do
        set[listed[k]:lower()] = true
    end
    return set
end
-- }}}
525
-- {{{ local function build_poem_embeddings_lookup
-- Index poem embeddings by id.
-- Takes the decoded embeddings file (an `embeddings` array of entries with
-- `id` and `embedding`) and returns a table mapping tostring(id) to the
-- embedding. Entries missing either field are skipped; missing input gives an
-- empty table.
local function build_poem_embeddings_lookup(embeddings_data)
    local by_id = {}

    local entries = embeddings_data and embeddings_data.embeddings
    if entries then
        for _, item in ipairs(entries) do
            local id, vector = item.id, item.embedding
            if id and vector then
                by_id[tostring(id)] = vector
            end
        end
    end

    return by_id
end
-- }}}
542
-- {{{ local function format_poem_for_word_page
-- Issue 8-043c: format one poem entry in the same box-drawing style as the
-- similar/different pages.
-- Issue 10-036: chrono_page_map supplies the page each poem's "chronological"
-- link must point at.
-- The progress bar shows the poem's CHRONOLOGICAL position, not its similarity
-- score, to help readers orient themselves in the timeline/story.
--
-- All text taken from the poem (content, content warning, file identifier) is
-- HTML-escaped before it is emitted, and box/wall padding is computed from the
-- RENDERED width (an entity such as &amp; and a multi-byte UTF-8 character
-- each count as one column), so the right-hand walls line up.
--
-- @param poem             poem record (poem_index, content, content_warning,
--                         category, id, metadata)
-- @param rank             unused; kept for the caller's signature
-- @param similarity       unused; kept for the caller's signature
-- @param poem_colors      optional table: poem_index -> { color = name }
-- @param color_config     optional table: color name -> hex color
-- @param chrono_map       optional table: poem_index -> { position,
--                         total_poems, timeline_progress }
-- @param chrono_page_map  optional table: poem_index -> chronological page name
-- @return the formatted entry as one string (lines joined with "\n")
local function format_poem_for_word_page(poem, rank, similarity, poem_colors, color_config, chrono_map, chrono_page_map)
    local CONTENT_WIDTH = 80 -- visible columns between the walls of a golden box

    -- Escape the three characters that are significant in HTML text.
    local function escape_html(text)
        return (tostring(text):gsub("&", "&amp;"):gsub("<", "&lt;"):gsub(">", "&gt;"))
    end

    -- Rendered width of a piece of markup: tags take no columns, an entity
    -- takes one, and UTF-8 continuation bytes are not counted.
    local function display_width(markup)
        local plain = markup:gsub("<[^>]+>", ""):gsub("&%a+;", "?")
        return #(plain:gsub("[\128-\191]", ""))
    end

    local poem_idx = poem.poem_index or 0

    -- Semantic color of this poem (gray when unknown).
    local poem_color_data = poem_colors and poem_colors[poem_idx]
    local semantic_color = poem_color_data and poem_color_data.color or "gray"
    local hex_color = color_config and color_config[semantic_color] or "#888888"

    -- Golden poems are detected from metadata.
    local is_golden = poem.metadata and poem.metadata.is_golden_poem

    -- Issue 8-045: prefer the time-based timeline_progress; fall back to the
    -- position-based percentage.
    local chrono_info = chrono_map and chrono_map[poem_idx]
        or { position = 1, total_poems = 1, timeline_progress = 50 }
    local progress_pct = chrono_info.timeline_progress
        or ((chrono_info.position / chrono_info.total_poems) * 100)

    -- Regular: 83 chars total. Golden: 82 interior + 2 corners = 84 total.
    local total_bar_chars = is_golden and 82 or 83
    local progress_chars = math.floor((progress_pct / 100) * total_bar_chars)

    -- Top progress bar from the shared poem-bars module (canonical geometry,
    -- same as the similar/different + chronological pages).
    poem_bars.configure(color_config)
    local colored_progress = poem_bars.progress_dashes(
        { percentage = progress_pct }, semantic_color, is_golden, "top", false).visual

    -- Navigation links.
    local base_path = ".."
    local similar_link = string.format("<a href='%s/similar/%04d-01.html'>similar</a>", base_path, poem_idx)
    local different_link = string.format("<a href='%s/different/%04d-01.html'>different</a>", base_path, poem_idx)
    -- The anchor must match the spans the chronological pages emit, which are
    -- get_poem_anchor_id() = "poem-<poem_index>". The old "poem-CATEGORY-ID"
    -- form never matched any anchor, so the link landed at the top of the page.
    local anchor_id = string.format("poem-%d", poem_idx)
    -- Issue 10-036: link to the paginated page; index.html is a redirect that
    -- loses anchors.
    local chrono_page = chrono_page_map and chrono_page_map[poem_idx] or "01"
    local chrono_link = string.format("<a href='%s/chronological/%s.html#%s'>chronological</a>", base_path, chrono_page, anchor_id)

    local content = escape_html(poem.content or "")
    local wrapped_lines = {}

    -- Append a boxed "CW: ..." notice followed by `blank_lines` empty lines.
    -- The box is 20 to 76 columns wide inside.
    local function push_cw_box(escaped_text, blank_lines)
        local cw_display = "CW: " .. escaped_text
        local width = display_width(cw_display)
        local box_width = math.min(math.max(width, 20), 76)
        local padded_cw = cw_display .. string.rep(" ", math.max(0, box_width - width))
        local rule = string.rep("─", box_width + 2)
        wrapped_lines[#wrapped_lines + 1] = " ┌" .. rule .. "┐"
        wrapped_lines[#wrapped_lines + 1] = " │ " .. padded_cw .. " │"
        wrapped_lines[#wrapped_lines + 1] = " └" .. rule .. "┘"
        for _ = 1, blank_lines do
            wrapped_lines[#wrapped_lines + 1] = ""
        end
    end

    -- Content warning from poem.content_warning (ActivityPub CW).
    if poem.content_warning and poem.content_warning ~= "" then
        push_cw_box(escape_html(poem.content_warning), 2)
    end

    -- In-content "CW:" / "Content warning:" first line.
    local main_content = content
    local cw_match = content:match("^%s*[Cc][Ww]%s*:(.-)[\n\r]")
    if not cw_match then
        cw_match = content:match("^%s*[Cc]ontent [Ww]arning%s*:(.-)[\n\r]")
    end
    if cw_match then
        local cw_text = cw_match:match("^%s*(.-)%s*$")
        main_content = content:gsub("^%s*[Cc][Ww]%s*:[^\n\r]*[\n\r]?", "")
        main_content = main_content:gsub("^%s*[Cc]ontent [Ww]arning%s*:[^\n\r]*[\n\r]?", "")
        if cw_text and #cw_text > 0 then
            -- cw_text comes from the already-escaped content.
            push_cw_box(cw_text, 1)
        end
    end

    -- Word-wrap each paragraph to 80. The wrap measures bytes of the escaped
    -- text, which can only break a line early, never late.
    for para in (main_content .. "\n"):gmatch("(.-)\n") do
        if para == "" then
            wrapped_lines[#wrapped_lines + 1] = ""
        else
            local current_line = ""
            for word in para:gmatch("%S+") do
                if #current_line + #word + 1 <= 80 then
                    current_line = current_line .. (current_line ~= "" and " " or "") .. word
                else
                    if current_line ~= "" then
                        wrapped_lines[#wrapped_lines + 1] = " " .. current_line
                    end
                    current_line = word
                end
            end
            if current_line ~= "" then
                wrapped_lines[#wrapped_lines + 1] = " " .. current_line
            end
        end
    end

    -- Golden poems get side walls: a colored ║ on the left and │ on the right,
    -- with the text padded to CONTENT_WIDTH rendered columns in between.
    local golden_line
    if is_golden then
        local colored_wall = string.format('<font color="%s"><b>║</b></font>', hex_color)
        golden_line = function(text)
            local pad = math.max(0, CONTENT_WIDTH - display_width(text))
            return colored_wall .. " " .. text .. string.rep(" ", pad) .. " │"
        end

        local golden_lines = {}
        for i, line in ipairs(wrapped_lines) do
            golden_lines[i] = golden_line(line:match("^%s*(.*)$") or line)
        end
        wrapped_lines = golden_lines
    end

    -- Navigation box + bottom bar from the shared poem-bars module. The old
    -- inline copy had drifted (golden junctions at 9/70 instead of 10/71),
    -- which is exactly why word-cloud golden poems were mangled.
    local nav_top, nav_mid
    if is_golden then
        nav_top = poem_bars.golden_corner_box_separator(hex_color, progress_chars)
        nav_mid = poem_bars.golden_corner_box_nav_line(similar_link, different_link, chrono_link, hex_color, progress_chars)
    else
        nav_top = poem_bars.corner_box_top(progress_chars, hex_color)
        nav_mid = poem_bars.corner_box_nav_line(similar_link, different_link, chrono_link, progress_chars, hex_color)
    end

    local bottom_line = poem_bars.progress_dashes(
        { percentage = progress_pct }, semantic_color, is_golden, "bottom", true).visual

    -- Poem identifier, same format as the similar/different pages:
    -- " -> file: fediverse/1234" or " -> file: notes/myfile".
    local category = poem.category or "unknown"
    local filename
    if category == "notes" and poem.metadata and poem.metadata.source_file then
        filename = poem.metadata.source_file
    else
        filename = tostring(poem.id or "unknown")
    end
    local poem_identifier = " -> file: " .. escape_html(category .. "/" .. filename)

    -- Assemble the entry.
    local output = { colored_progress }
    if is_golden then
        -- The header (" -> file:") and the blank line below it belong INSIDE
        -- the golden box, with the same walls as every other line, so the box
        -- has consistent borders and uniform line width.
        output[#output + 1] = golden_line((poem_identifier:gsub("^%s+", "")))
        output[#output + 1] = golden_line("")
    else
        output[#output + 1] = poem_identifier
        output[#output + 1] = ""
    end
    output[#output + 1] = table.concat(wrapped_lines, "\n")
    output[#output + 1] = nav_top
    output[#output + 1] = nav_mid
    output[#output + 1] = bottom_line

    return table.concat(output, "\n")
end
-- }}}
733
-- {{{ local function generate_word_page
-- Generate the HTML page for a single word, listing the poems most similar to it.
-- Issue 8-043c: same box-drawing format as the similar/different pages.
-- Issue 8-050c: the word's color is shown in the header; each poem keeps its
--               own color for the progress bars.
-- Issue 8-050e: the header's chronological link points at the centroid-based
--               location in the timeline when one is supplied.
-- Issue 10-036: chrono_page_map gives the correct per-poem pagination links.
-- The progress bar shows CHRONOLOGICAL position (not similarity).
--
-- The page is written to <output_dir>/wordcloud/<word>.html, where <word> is
-- lower-cased and stripped of non-alphanumerics. `word` itself is placed in
-- the title and heading as-is.
--
-- @param ranked_poems       array of { poem = ..., similarity = ... }, best first
-- @param poems_per_page     how many of them to show
-- @param word_hex_color     optional header color (default "#888888")
-- @param chrono_center_link optional header chronological link
-- @return whatever utils.write_file returns

-- <output_dir>/wordcloud directories already created by this process.
local wordcloud_dirs_ready = {}

-- Create <output_dir>/wordcloud once per process instead of forking a shell
-- for every word page. The path is single-quoted for the shell, so quotes,
-- "$" and backticks in it are taken literally.
local function ensure_wordcloud_dir(output_dir)
    local target = output_dir .. "/wordcloud"
    if wordcloud_dirs_ready[target] then
        return
    end
    local quoted = "'" .. target:gsub("'", "'\\''") .. "'"
    local status = os.execute("mkdir -p " .. quoted)
    -- os.execute reports success as 0 (Lua 5.1 / LuaJIT) or true (5.2+).
    -- Only a success is remembered, so a failed mkdir is retried next time.
    if status == 0 or status == true then
        wordcloud_dirs_ready[target] = true
    end
end

local function generate_word_page(word, ranked_poems, output_dir, poems_per_page, poem_colors, color_config, chrono_map, word_hex_color, chrono_center_link, chrono_page_map)
    local safe_word = word:lower():gsub("[^%w]", "")
    local output_file = output_dir .. "/wordcloud/" .. safe_word .. ".html"

    ensure_wordcloud_dir(output_dir)

    -- Take the top N poems.
    local top_poems = {}
    for i = 1, math.min(poems_per_page, #ranked_poems) do
        top_poems[i] = ranked_poems[i]
    end

    -- Issue 8-050c: the word's semantic color for the header (gray if absent).
    local header_color = word_hex_color or "#888888"

    -- Issue 8-050e: centroid-based chronological link if provided, else default.
    local base_path = ".."
    local chrono_link = chrono_center_link or (base_path .. "/chronological/index.html")

    -- Issue 16-010: font style for the Hack Nerd Font font-stack.
    -- Same centering CSS the similar/different/chronological pages use: each
    -- <pre> centers as an inline-block (text stays left), so the poem column
    -- lands on the page centerline even when an attached image is wider.
    local font_style = [[<style>body, pre { font-family: 'Hack Nerd Font', 'Hack', 'Fira Code', 'JetBrains Mono', 'Cascadia Code', 'Consolas', 'Monaco', 'Liberation Mono', 'Courier New', monospace; }
td { text-align: center; } pre { display: inline-block; text-align: left; margin: 0 auto; } img, video, audio { margin-left: auto; margin-right: auto; }</style>]]

    local html_parts = {}
    html_parts[#html_parts + 1] = string.format([[<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Poems similar to: %s</title>
%s</head>
<body bgcolor="#000000" text="#FFFFFF" link="#6699FF" vlink="#9966FF">
<center>
<h1>Poems similar to: <i><font color="%s">%s</font></i></h1>
<p>Top %d poems ranked by semantic similarity (progress bar shows chronological position)</p>
<!-- Issue 16-010: Changed main.html to wordcloud.html (main.html doesn't exist) -->
<p><a href="%s/wordcloud.html">Menu</a> │ <a href="%s">Chronological</a></p>
</center>
<hr>
<table align="center"><tr><td>
<pre>
]], word, font_style, header_color, word, #top_poems, base_path, chrono_link)

    -- Ranked poems in box-drawing format.
    -- Issue 10-036: chrono_page_map is passed through for per-poem links.
    for i, entry in ipairs(top_poems) do
        html_parts[#html_parts + 1] = format_poem_for_word_page(entry.poem, i, entry.similarity, poem_colors, color_config, chrono_map, chrono_page_map)
        html_parts[#html_parts + 1] = "\n"
    end

    html_parts[#html_parts + 1] = [[</pre>
</td></tr></table>
</body>
</html>
]]

    return utils.write_file(output_file, table.concat(html_parts))
end
-- }}}
805
806-- {{{ function M.generate_word_embeddings
807-- Issue 8-043b: Stage 6 - Generate word embeddings only (expensive operation)
808-- Called during embedding generation stage of the pipeline
-- Stage 6: make sure every word-cloud word has a cached embedding.
--
-- Loads poems.json, derives the word list, embeds only the words that are
-- missing from the word-embedding cache (one batched call), saves the cache,
-- then computes and saves the per-word semantic colors.
--
-- @param options table|nil Accepted for parity with the other entry points;
--        no field of it is read by this function.
-- @return number|nil Count of listed words that have an embedding after the
--         run (already cached + newly embedded), or nil when poems.json
--         cannot be loaded.
function M.generate_word_embeddings(options)
    options = options or {}

    -- Resolve the inference endpoint. Note that nothing here probes the
    -- server; an unreachable server only shows up as a failed batch below.
    -- Issue 10-017: Use build_host_url() instead of deprecated OLLAMA_ENDPOINT
    local endpoint = inference_config.build_host_url()
    utils.log_info("Using inference endpoint: " .. endpoint)

    -- Load poems for word extraction
    local poems_file = utils.asset_path("poems.json")
    local poems_data = utils.read_json_file(poems_file)
    if not poems_data then
        utils.log_error("Could not load poems.json")
        return nil
    end

    -- Get word list (using CONFIG.max_words from CLI or config)
    local stop_words = load_stop_words()
    local words = get_word_list(poems_data, stop_words, 5, CONFIG.max_words, 3)
    utils.log_info(string.format("Processing %d words", #words))

    -- Load cached word embeddings
    local word_embeddings = load_word_embeddings_cache()

    -- Issue 10-050: collect the words missing from cache, then embed them all in
    -- one batched + (sub-)batched call instead of one curl per word. endpoint +
    -- prompt formatter are passed so the prefix matches the poem embeddings
    -- (required for the word-to-poem cosine comparison to be meaningful).
    local missing = {}
    for _, word in ipairs(words) do
        if not word_embeddings[word] then
            missing[#missing + 1] = word
        end
    end
    local cache_hits = #words - #missing
    local cache_misses = 0

    if #missing > 0 then
        utils.log_info(string.format("Embedding %d missing words (batched)...", #missing))
        local vectors = fuzzy.embed_texts_with_chunking(missing, CONFIG.model_name, {
            endpoint = endpoint,
            format_fn = inference_config.format_embedding_prompt
        })
        if vectors then
            -- vectors[k] is taken to be the embedding of missing[k].
            for k, word in ipairs(missing) do
                local embedding = vectors[k]
                if embedding and type(embedding) == "table" and #embedding > 0 then
                    word_embeddings[word] = embedding
                    cache_misses = cache_misses + 1
                    -- Periodic checkpoint so a crash mid-run keeps prior work.
                    -- NOTE(review): all vectors are already in memory here, so
                    -- this only guards against a failure inside this loop.
                    if cache_misses % 50 == 0 then
                        save_word_embeddings_cache(word_embeddings)
                    end
                else
                    utils.log_warn(string.format("Failed to embed word '%s'", word))
                end
            end
        else
            -- Best effort: keep going so the existing cache and colors are still saved.
            utils.log_warn("Batch word embedding failed (inference server unreachable?)")
        end
    end

    -- Save final cache
    save_word_embeddings_cache(word_embeddings)
    utils.log_info(string.format("Word embeddings: %d cached, %d newly generated", cache_hits, cache_misses))

    -- Issue 8-050a: Compute and save semantic colors for all words
    local word_colors = compute_word_colors(word_embeddings)
    if word_colors then
        save_word_colors_cache(word_colors)
        -- Count with pairs(): the HTML stage reads this cache as
        -- word_colors[word], i.e. a word-keyed map, and `#` on a table
        -- without a 1..n sequence part does not give the entry count.
        local color_count = 0
        for _ in pairs(word_colors) do
            color_count = color_count + 1
        end
        utils.log_info(string.format("Saved semantic colors for %d words to word_colors.json", color_count))
    end

    return cache_hits + cache_misses
end
887-- }}}
888
889-- {{{ function M.generate_word_html
890-- Issue 8-043b: Stage 9 - Generate HTML pages only (requires existing embeddings)
891-- Issue 8-043c: Now uses box-drawing format with semantic colors
892-- Called during HTML generation stage of the pipeline
-- Stage 9: render one similarity page per word-cloud word from cached data.
--
-- Reads poems.json, the poem embeddings, the word-embedding cache and the
-- color caches; no embeddings are generated here. For every listed word that
-- has a cached embedding, poems are ranked by cosine similarity to the word,
-- the top CONFIG.poems_per_word_page are kept (reordered for color spread when
-- color embeddings are available) and handed to generate_word_page().
--
-- @param options table|nil Optional settings; only `output_dir` is read
--        (defaults to DIR .. "/output").
-- @return number|nil Number of pages generate_word_page() reported as written,
--         or nil when poems.json, the poem embeddings, or the word-embedding
--         cache cannot be loaded.
function M.generate_word_html(options)
    options = options or {}
    local output_dir = options.output_dir or (DIR .. "/output")

    -- Load poems
    local poems_file = utils.asset_path("poems.json")
    local poems_data = utils.read_json_file(poems_file)
    if not poems_data then
        utils.log_error("Could not load poems.json")
        return nil
    end

    -- Load poem embeddings
    local embeddings_file = utils.embeddings_dir() .. "/embeddings.json"
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data then
        utils.log_error("Could not load poem embeddings - run --generate-embeddings first")
        return nil
    end
    local poem_lookup = build_poem_embeddings_lookup(embeddings_data)

    -- Load word embeddings (must exist from Stage 6)
    local word_embeddings = load_word_embeddings_cache()
    local word_count = 0
    for _ in pairs(word_embeddings) do word_count = word_count + 1 end

    if word_count == 0 then
        utils.log_error("No word embeddings found - run --embeddings-only first")
        return nil
    end
    utils.log_info(string.format("Loaded %d word embeddings", word_count))

    -- Issue 8-043c: Load poem colors for semantic coloring
    -- Issue 10-034: Fixed path - poem_colors.json is in embeddings directory, not assets root
    local poem_colors_file = utils.embeddings_dir() .. "/poem_colors.json"
    local poem_colors_data = utils.read_json_file(poem_colors_file)
    -- poem_colors.json stores a plain ARRAY whose position IS the poem_index
    -- (entries carry color/similarity but NO poem_index field). flat-html
    -- reads it positionally (poem_colors[poem_index]); we must do the same.
    -- Keying on entry.poem_index (always nil) left the table empty and every
    -- word-page progress bar gray, so the array is used directly.
    local poem_colors = (poem_colors_data and poem_colors_data.poem_colors) or {}
    if not (poem_colors_data and poem_colors_data.poem_colors) then
        utils.log_warn("No poem colors found - using default gray")
    end

    -- Issue 8-050a: Load word colors for per-word semantic coloring
    local word_colors = load_word_colors_cache()
    if next(word_colors) == nil then
        utils.log_warn("No word colors found - run --embeddings-only to generate them")
    end

    -- Issue 8-050b: Load color embeddings for balanced color selection
    local color_embeddings = load_color_embeddings()
    local use_balanced_selection = color_embeddings ~= nil
    if not use_balanced_selection then
        utils.log_warn("No color embeddings found - using pure similarity ranking")
    end

    -- Issue 8-050b: Get ordered color names from config
    local color_names = unified_config.color_names
        or {"red", "blue", "green", "purple", "orange", "yellow", "gray"}

    -- Issue 8-043c: Load color configuration from unified config
    local color_config = unified_config.colors or {
        red = "#FF6B6B",
        orange = "#FFA94D",
        yellow = "#FFE066",
        green = "#69DB7C",
        cyan = "#38D9A9",
        blue = "#74C0FC",
        indigo = "#748FFC",
        violet = "#DA77F2",
        gray = "#868E96"
    }

    -- Issue 8-043c: Compute chronological mapping for progress bars
    -- This maps poem_index -> {position, total_poems} for timeline orientation
    -- Issue 8-050e: Also builds chrono_page_map for centroid-based navigation
    local chrono_map = {}
    local chrono_page_map = {} -- poem_index -> page string ("01", "02", etc.)
    do
        -- Reuse the chronological-page generator's OWN mapping instead of a
        -- second inline copy, so sort order, tiebreak and page size cannot
        -- drift from the real chronological pagination (drift made links jump
        -- to the wrong page). The page size comes from
        -- resolve_chrono_per_page() and MUST match what the chronological
        -- pages were built with (Issue 10-036).
        local per_page = resolve_chrono_per_page()
        local mapping = flat_html.compute_chronological_mapping(poems_data, per_page)
        local total_poems = 0
        for poem_index, info in pairs(mapping) do
            chrono_map[poem_index] = {
                position = info.position,
                total_poems = info.total_poems,
                -- Carry the time-based progress so the word-page bars match the
                -- similar/different/chronological pages exactly (not position-based).
                timeline_progress = info.timeline_progress,
            }
            chrono_page_map[poem_index] = string.format("%02d", info.page_number)
            total_poems = info.total_poems
        end
        utils.log_info(string.format("Built chronological mapping for %d poems (%d per page, shared)", total_poems, per_page))
    end

    -- Build poem index lookup
    local poems_by_index = {}
    for _, poem in ipairs(poems_data.poems) do
        if poem.poem_index then
            poems_by_index[poem.poem_index] = poem
        end
    end

    -- Get word list (same as embedding generation to ensure consistency)
    local stop_words = load_stop_words()
    local words = get_word_list(poems_data, stop_words, 5, CONFIG.max_words, 3)

    -- Generate pages for each word
    local pages_generated = 0
    for i, word in ipairs(words) do
        local word_embedding = word_embeddings[word]
        if word_embedding then
            io.write(string.format("\rGenerating word page %d/%d: %s ", i, #words, word))
            io.flush()

            -- Issue 8-050b: Build candidate pool with embeddings preserved
            -- Phase 1: Rank ALL poems by word similarity
            local candidates = {}
            for poem_id_str, poem_embedding in pairs(poem_lookup) do
                local poem_id = tonumber(poem_id_str)
                local poem = poems_by_index[poem_id]
                if poem and poem_embedding then
                    local word_sim = cosine_similarity(word_embedding, poem_embedding)
                    table.insert(candidates, {
                        poem = poem,
                        embedding = poem_embedding,
                        word_similarity = word_sim,
                        similarity = word_sim -- for generate_word_page compatibility
                    })
                end
            end

            -- Sort by word similarity (descending). Candidates were gathered
            -- in pairs() order, which is unspecified, and table.sort is not
            -- stable, so break ties on poem_index to keep the page (and the
            -- top-N cut) reproducible between runs.
            table.sort(candidates, function(a, b)
                if a.word_similarity ~= b.word_similarity then
                    return a.word_similarity > b.word_similarity
                end
                return a.poem.poem_index < b.poem.poem_index
            end)

            -- Issue 8-050b (revised): relevance first, THEN color spread.
            -- The page always shows the top-N MOST RELEVANT poems by similarity;
            -- balanced_color_select is handed exactly those N (not a 7N pool),
            -- so it keeps the whole relevant set and only REORDERS it to spread
            -- the colors across the page, instead of displacing strong matches
            -- with weaker color-diverse ones.
            local ranked_poems
            if use_balanced_selection then
                local pool_size = math.min(#candidates, CONFIG.poems_per_word_page)
                local pool = {}
                for j = 1, pool_size do pool[j] = candidates[j] end

                -- Reorder the top-N relevant poems for color spread (keeps all N).
                ranked_poems = balanced_color_select(
                    pool, color_embeddings, color_names, CONFIG.poems_per_word_page)
            else
                -- Fallback: pure similarity ranking (no color data available).
                -- Trim to the page size here as well: compute_centroid() below
                -- is handed ranked_poems, so passing the full candidate list
                -- would average over every poem and produce the same
                -- "Chronological" link for every word.
                -- NOTE(review): assumes generate_word_page caps its output at
                -- poems_per_word_page, so the rendered list is unchanged - confirm.
                local limit = CONFIG.poems_per_word_page
                if limit and limit < #candidates then
                    ranked_poems = {}
                    for j = 1, limit do ranked_poems[j] = candidates[j] end
                else
                    ranked_poems = candidates
                end
            end

            -- Issue 8-050c: Get word's semantic color for header
            local word_color_entry = word_colors[word]
            local word_semantic_color = word_color_entry and word_color_entry.color or "gray"
            local word_hex_color = color_config and color_config[word_semantic_color] or "#888888"

            -- Issue 8-050e: Compute centroid-based chronological link
            local chrono_center_link = nil
            do
                -- Compute centroid of selected poems
                local centroid = compute_centroid(ranked_poems, poem_lookup)
                if centroid then
                    -- Find the poem closest to the centroid
                    local center_poem = find_closest_poem_to_centroid(centroid, poem_lookup, poems_by_index)
                    if center_poem and center_poem.poem_index then
                        -- Anchor must match the spans the chronological pages
                        -- emit: get_poem_anchor_id() = "poem-<poem_index>".
                        local anchor_id = string.format("poem-%d", center_poem.poem_index)
                        -- Issue 10-036: Use "01" fallback instead of "index" (redirect loses anchors)
                        local chrono_page = chrono_page_map[center_poem.poem_index] or "01"
                        -- Build full link
                        local base_path = ".."
                        chrono_center_link = string.format("%s/chronological/%s.html#%s",
                            base_path, chrono_page, anchor_id)
                    end
                end
            end

            -- Generate page with semantic colors and chronological position
            -- Issue 10-036: Pass chrono_page_map for correct per-poem pagination links
            if generate_word_page(word, ranked_poems, output_dir, CONFIG.poems_per_word_page, poem_colors, color_config, chrono_map, word_hex_color, chrono_center_link, chrono_page_map) then
                pages_generated = pages_generated + 1
            end
        else
            utils.log_warn(string.format("Missing embedding for word '%s', skipping", word))
        end
    end
    print("") -- Newline after progress

    utils.log_info(string.format("Generated %d word similarity pages in %s/wordcloud/", pages_generated, output_dir))
    return pages_generated
end
1111-- }}}
1112
1113-- {{{ function M.generate_word_pages
1114-- Backward compatible: generates both embeddings and HTML (original behavior)
-- Run both stages back to back: embeddings first, then the HTML pages.
--
-- @param options table|nil Passed through to both stages (an empty table is
--        substituted when nil).
-- @return Whatever M.generate_word_html returns, or nil when the embedding
--         stage returned a falsy value (the HTML stage is then not run).
function M.generate_word_pages(options)
    local opts = options or {}

    -- The HTML stage only makes sense once the embedding stage reported a result.
    if M.generate_word_embeddings(opts) then
        return M.generate_word_html(opts)
    end

    return nil
end
1127-- }}}
1128
1129-- {{{ function M.main
-- Dispatch to the stage selected by `mode`.
--
-- @param mode string|nil "embeddings" or "html" selects a single stage; any
--        other value runs both stages. Falls back to RUN_MODE when nil.
-- @return The selected stage function's return value.
function M.main(mode)
    local selected = mode or RUN_MODE

    -- Single-stage modes; everything else falls through to the combined run.
    local single_stage = {
        embeddings = M.generate_word_embeddings,
        html = M.generate_word_html,
    }

    local run = single_stage[selected] or M.generate_word_pages
    return run()
end
1141-- }}}
1142
1143-- {{{ Command line execution
-- Script entry point. The guard is kept verbatim and evaluated at chunk level:
-- debug.getinfo(3) depends on the call-stack depth at this exact spot.
-- NOTE(review): presumably nil only when the file is run directly rather than
-- require'd - confirm before moving this check into a function.
if arg and #arg >= 0 and debug.getinfo(3) == nil then
    if RUN_MODE == "help" then
        -- One entry per output line; printed in order, then exit successfully.
        local help_lines = {
            "Usage: luajit src/generate-word-pages.lua [DIR] [OPTIONS]",
            "",
            "Generates similarity pages for word cloud words.",
            "For each word, creates a page showing poems ranked by semantic similarity.",
            "",
            "Options:",
            " DIR Project directory (default: /mnt/mtwo/programming/ai-stuff/neocities-modernization)",
            " --embeddings-only Generate word embeddings only (Stage 6 - expensive)",
            " --html-only Generate HTML pages only (Stage 9 - fast, requires embeddings)",
            " --all Include all words (no max_words limit)",
            " --words N Set maximum words to process (default: 200 from config)",
            " --help Show this help message",
            "",
            "Pipeline Integration (Issue 8-043b):",
            " Stage 6 (Embeddings): luajit src/generate-word-pages.lua --embeddings-only",
            " Stage 9 (HTML): luajit src/generate-word-pages.lua --html-only",
            "",
            "Without flags, runs both stages (backward compatible).",
        }
        for _, line in ipairs(help_lines) do
            print(line)
        end
        os.exit(0)
    end

    -- Any non-help mode: let M.main pick the stage from RUN_MODE.
    M.main()
end
-- }}}

return M
1172