scripts/generate-html-parallel
#!/usr/bin/env luajit
-- Multi-threaded HTML page generation using effil
-- Generates similarity and difference pages in parallel for better performance
-- {{{ Early help check (before any requires)
-- Check for --help early, before setting up package paths
-- Scan the raw argument vector for a help flag before anything else runs, so
-- usage can be printed without touching package paths or loading any module.
-- NOTE: parse_args (later in this file) also accepts `-i` as an alias for
-- --incremental and a bare number as the thread count; neither is listed in
-- the text below.
for i = 1, #arg do
if arg[i] == "--help" or arg[i] == "-h" then
print([[
Usage: generate-html-parallel [DIR] [OPTIONS]
Options:
--test Generate only first 10 poems (for quick testing)
--similar-only Generate only similarity pages
--different-only Generate only difference pages
--incremental Skip poems that already have HTML files
--threads=N Number of parallel threads (default: 8)
Pagination Options:
--pages=N Number of pages worth of poems to include (default: from config)
--poems-per-page=N Poems per page (default: from config, 100)
Total poems shown = pages × poems-per-page
-h, --help Show this help message
Examples:
generate-html-parallel # Default: 8 threads, all poems
generate-html-parallel . --threads=4 # 4 threads
generate-html-parallel . --incremental # Skip existing files
generate-html-parallel . --similar-only # Only similarity pages
generate-html-parallel . --pages=1 # Show only top 100 poems per file
generate-html-parallel . --pages=5 --poems-per-page=50 # Top 250 poems
]])
-- Asking for help is a successful run: exit with status 0 immediately.
os.exit(0)
end
end
-- }}}
-- {{{ local function setup_dir_path
--- Resolve the project root directory.
-- @param provided_dir directory taken from the command line, or nil
-- @return provided_dir when it is truthy, otherwise the hard-coded default
--         project location
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
-- }}}
-- {{{ Parse directory from arguments (before package.path setup)
--- Pick the project directory out of a raw argument list.
-- The first argument that is neither a flag (leading "-") nor an all-digit
-- string (which is treated elsewhere as a thread count) is the directory.
-- @param args array of argument strings
-- @return the first matching argument, or nil when there is none
local function find_dir_arg(args)
    for index = 1, #args do
        local candidate = args[index]
        local is_flag = candidate:sub(1, 1) == "-"
        if not is_flag then
            local is_number = candidate:match("^%d+$") ~= nil
            if not is_number then
                return candidate
            end
        end
    end
    return nil
end
local DIR = setup_dir_path(find_dir_arg(arg))
-- }}}
-- Add effil library path
package.cpath = "/home/ritz/programming/ai-stuff/libs/lua/effil-jit/build/?.so;" .. package.cpath
package.path = DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
local effil = require("effil")
local utils = require("utils")
local dkjson = require("dkjson")
-- {{{ local function relative_path
-- Issue 7-003: Show project name instead of "./" when path equals DIR
--- Shorten an absolute path for display, relative to the project root DIR.
-- Issue 7-003: when the path is the project root itself, show the project
-- directory name (e.g. "proj/") instead of "./".
-- @param absolute_path path string to shorten
-- @return "<name>/" for the root itself, "./<rest>" for paths inside the
--         root, otherwise absolute_path unchanged
local function relative_path(absolute_path)
    if absolute_path == DIR or absolute_path == DIR .. "/" then
        local dir_name = DIR:match("([^/]+)/?$")
        if dir_name then
            return dir_name .. "/"
        end
        -- DIR has no name component (e.g. "/"): nothing shorter to show.
        return absolute_path
    end

    -- Only strip DIR when it ends on a path-component boundary; a bare
    -- string-prefix test would also match siblings such as "<DIR>-backup/x"
    -- and print a mangled path.
    local prefix = DIR
    if prefix:sub(-1) ~= "/" then
        prefix = prefix .. "/"
    end
    if absolute_path:sub(1, #prefix) == prefix then
        return "./" .. absolute_path:sub(#prefix + 1)
    end

    return absolute_path
end
-- }}}
-- {{{ Argument parsing
-- Parse command line arguments for more flexible option handling
--- Parse the command-line argument list into an options table.
-- Recognised arguments:
--   --test, --similar-only, --different-only, --incremental / -i, --help / -h
--   --threads=N or a bare all-digit argument  (worker thread count)
--   --pages=N, --poems-per-page=N             (Issue 8-022 pagination overrides)
--   any other argument not starting with "-"  (project directory; last one wins)
-- Unrecognised flags are ignored.
-- An invalid or non-positive thread count falls back to the default (8) with
-- a warning on stderr. An invalid pagination value prints an error and exits
-- with status 1.
-- @param args array of argument strings (nil is treated as empty)
-- @return table with fields dir, threads, test, similar_only, different_only,
--         incremental, help, pages, poems_per_page
local function parse_args(args)
    local DEFAULT_THREADS = 8

    local opts = {
        dir = nil,
        threads = DEFAULT_THREADS,
        test = false,
        similar_only = false,
        different_only = false,
        incremental = false, -- Skip existing files
        help = false,
        -- Issue 8-022: Pagination CLI flags (nil means use config defaults)
        pages = nil,         -- Number of pages worth of poems to show
        poems_per_page = nil -- Poems per page
    }

    -- A thread count of zero can never do any work, so anything that is not a
    -- positive number is replaced by the default instead of being passed on.
    local function set_threads(raw, source)
        local count = raw and tonumber(raw)
        if count and count > 0 then
            opts.threads = count
        else
            io.stderr:write(string.format(
                "Warning: invalid thread count '%s', using default (%d)\n",
                source, DEFAULT_THREADS))
            opts.threads = DEFAULT_THREADS
        end
    end

    local i = 1
    while args and i <= #args do
        local a = args[i]
        if a == "--test" then
            opts.test = true
        elseif a == "--similar-only" then
            opts.similar_only = true
        elseif a == "--different-only" then
            opts.different_only = true
        elseif a == "--incremental" or a == "-i" then
            opts.incremental = true
        elseif a == "--help" or a == "-h" then
            opts.help = true
        elseif a:match("^--threads=") then
            set_threads(a:match("^--threads=(%d+)"), a)
        -- Issue 8-022: Parse pagination flags
        elseif a:match("^--pages=") then
            local val = tonumber(a:match("^--pages=(%d+)"))
            if val and val > 0 then
                opts.pages = val
            else
                print("Error: --pages requires a positive integer")
                os.exit(1)
            end
        elseif a:match("^--poems%-per%-page=") then
            local val = tonumber(a:match("^--poems%-per%-page=(%d+)"))
            if val and val > 0 then
                opts.poems_per_page = val
            else
                print("Error: --poems-per-page requires a positive integer")
                os.exit(1)
            end
        elseif a:match("^%d+$") then
            -- Bare number: shorthand for the thread count.
            set_threads(a, a)
        elseif a:sub(1, 1) ~= "-" then
            opts.dir = a
        end
        i = i + 1
    end

    return opts
end
local OPTS = parse_args(arg)
-- Note: --help is handled early (before requires) at top of script
-- }}}
-- Configuration (from parsed arguments)
-- Worker thread count (parse_args defaults this to 8)
local NUM_THREADS = OPTS.threads
-- Set by --test
local TEST_MODE = OPTS.test
-- Set by --similar-only
local SIMILARITY_ONLY = OPTS.similar_only
-- Set by --different-only
local DIFFERENCE_ONLY = OPTS.different_only
-- Set by --incremental / -i
local INCREMENTAL_MODE = OPTS.incremental
-- NOTE(review): presumably the cap applied in test mode (the help text says
-- --test generates the first 10 poems); its use is outside this section.
local MAX_TEST_PAGES = 10
local USE_CACHE = true -- Use pre-computed diversity sequences if available
-- Update DIR if provided in args
-- NOTE: package.path was already built from the earlier value of DIR, so this
-- reassignment only affects code that reads DIR from here on. parse_args keeps
-- the LAST directory-like argument, whereas find_dir_arg picked the FIRST.
if OPTS.dir then
DIR = OPTS.dir
end
-- {{{ Issue 8-022: Load pagination config and apply CLI overrides
-- Issue 10-014: Migrated from input-sources.json to unified config.lua
--- Build the effective pagination settings.
-- Precedence, lowest to highest: built-in defaults, the `pagination` table
-- returned by DIR/config.lua (if that file loads), command-line overrides
-- from OPTS (Issue 8-022).
-- Issue 10-014: configuration comes from the unified config.lua.
-- @return table { poems_per_page = ..., max_pages_per_poem = ... }
--         (a max_pages_per_poem of 0 means "no limit")
local function load_pagination_config()
    -- Built-in fallbacks used when config.lua is missing or incomplete
    local per_page = 100
    local max_pages = 15

    -- dofile raises when the file is absent or broken; pcall turns that into
    -- a quiet fall-back to the defaults.
    local loaded, config = pcall(dofile, DIR .. "/config.lua")
    local from_file = loaded and config and config.pagination
    if from_file then
        per_page = from_file.poems_per_page or per_page
        max_pages = from_file.max_pages_per_poem or max_pages
    end

    -- Command-line flags win over everything else
    if OPTS.poems_per_page then
        per_page = OPTS.poems_per_page
    end
    if OPTS.pages then
        max_pages = OPTS.pages
    end

    return {
        poems_per_page = per_page,
        max_pages_per_poem = max_pages
    }
end
-- Effective pagination settings (defaults, then config.lua, then CLI flags)
local PAGINATION = load_pagination_config()
-- Upper bound on the number of poems written into each generated file:
-- pages × poems-per-page. A result of 0 (max_pages_per_poem == 0) means
-- "no limit"; the workers only apply the cap when the value is > 0.
local MAX_POEMS_TO_SHOW = PAGINATION.max_pages_per_poem * PAGINATION.poems_per_page
local DIVERSITY_LIMIT = MAX_POEMS_TO_SHOW -- Limit diversity sequence to same amount
-- }}}
-- Color configuration (shared)
-- Colour name -> hex value.
-- NOTE: the worker functions below carry an inline copy of this same palette
-- (they are documented as not being allowed to capture upvalues), so any
-- change here has to be mirrored in each worker.
local COLOR_CONFIG = {
red = "#dc3c3c",
blue = "#3c78dc",
green = "#3cb45a",
purple = "#8c3cc8",
orange = "#e68c3c",
yellow = "#c8b428",
gray = "#787878"
}
-- Issue 8-057: Boost visual formatting color scheme for worker threads
-- Boosts are reshared content from other fediverse users, displayed with nested frames
-- The workers read exactly these four keys from their boost_colors parameter:
-- arrow, outer_frame, inner_box, content_text.
local BOOST_COLOR_CONFIG = {
arrow = "#dc3c3c", -- Red: ◀─ and ─▶ arrows, [BOOST] label
outer_frame = "#3c78dc", -- Blue: ╔═╗║╚═╝ outer frame
inner_box = "#2aa198", -- Teal: ┌─┐│└─┘ inner content box
content_text = "#c8b428" -- Yellow: boosted text content
}
-- {{{ Worker function for similarity page generation
-- Note: This function must not capture any upvalues - all data passed as parameters
-- Issue 8-022: Added max_poems_to_show parameter to limit output size
-- Issue 8-057: Added boost visual formatting support (is_boost flag in poem array)
--- Worker: write the "sorted by similarity" page for one starting poem.
-- Must not capture any upvalues; everything it needs arrives as a parameter.
-- Issue 8-022: max_poems_to_show caps the page size.
-- Issue 8-057: boosted poems are drawn inside a nested frame.
-- @param poem_id               numeric id of the starting poem
-- @param poem_content          text of the starting poem
-- @param poem_category         category of the starting poem (used in the file name)
-- @param poem_is_boost         truthy when the starting poem is a boost
-- @param similarities_for_poem map tostring(other_id) -> similarity score
-- @param all_poems_array       flat array, 4 entries per poem: id, content, category, is_boost
-- @param poem_colors_table     map tostring(id) -> colour name
-- @param max_poem_id           divisor for the per-poem progress bar
-- @param output_dir            page goes to <output_dir>/similar/<category>-<id>.html
-- @param max_poems_to_show     total poems on the page including the start; <= 0 means all
-- @param boost_colors          table with arrow, outer_frame, inner_box, content_text
-- @return true when the file was opened and written, false otherwise
local function similarity_worker(poem_id, poem_content, poem_category, poem_is_boost,
                                 similarities_for_poem, all_poems_array, poem_colors_table,
                                 max_poem_id, output_dir, max_poems_to_show, boost_colors)
    -- Snapshot every shared (effil) table into a plain Lua table up front, so
    -- the loops below do fast local reads instead of cross-thread accesses
    -- (issue 8-002, "Bug 2: Catastrophic effil.table Access Overhead").
    local poems_flat = {}
    for idx = 1, #all_poems_array do
        poems_flat[idx] = all_poems_array[idx]
    end
    local similarity_of = {}
    for key, score in pairs(similarities_for_poem) do
        similarity_of[key] = score
    end
    local color_of = {}
    for key, name in pairs(poem_colors_table) do
        color_of[key] = name
    end
    local frame_colors = {}
    for key, hex in pairs(boost_colors) do
        frame_colors[key] = hex
    end

    -- {{{ Issue 8-057: boost frame helpers (defined here so nothing is captured
    -- from file scope); simplified variant without a navigation section

    -- Top border: arrow, corner, 78-cell bar with the [BOOST] label centred in
    -- the filled part, corner.
    local function boost_top_border(progress_percent, colors)
        local WIDTH = 78
        local TAG = "[BOOST]"
        local TAG_LEN = 7
        local filled = math.floor(progress_percent * WIDTH)
        if filled < TAG_LEN + 2 then
            filled = TAG_LEN + 2 -- always leave room for the label
        end
        local tag_first = math.floor(filled / 2) - math.floor(TAG_LEN / 2)
        if tag_first < 1 then tag_first = 1 end
        local tag_last = tag_first + TAG_LEN - 1

        local heavy_cell = string.format('<font color="%s"><b>═</b></font>', colors.outer_frame)
        local light_cell = string.format('<font color="%s">─</font>', colors.outer_frame)

        local cells = {}
        for col = 1, WIDTH do
            if col >= tag_first and col <= tag_last then
                local offset = col - tag_first + 1
                cells[col] = string.format('<font color="%s"><b>%s</b></font>',
                    colors.arrow, TAG:sub(offset, offset))
            elseif col <= filled then
                cells[col] = heavy_cell
            else
                cells[col] = light_cell
            end
        end

        return string.format('<font color="%s"><b>◀─</b></font>', colors.arrow)
            .. string.format('<font color="%s"><b>╔</b></font>', colors.outer_frame)
            .. table.concat(cells)
            .. string.format('<font color="%s"><b>╗</b></font>', colors.outer_frame)
    end

    -- Top edge of the inner content box, between the outer walls
    local function boost_inner_top(colors)
        local wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
        local dash = string.format('<font color="%s">─</font>', colors.inner_box)
        return table.concat({
            wall, " ",
            string.format('<font color="%s"><b>┌</b></font>', colors.inner_box),
            string.rep(dash, 76),
            string.format('<font color="%s"><b>┐</b></font>', colors.inner_box),
            " ", wall
        })
    end

    -- Bottom edge of the inner content box, between the outer walls
    local function boost_inner_bottom(colors)
        local wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
        local dash = string.format('<font color="%s">─</font>', colors.inner_box)
        return table.concat({
            wall, " ",
            string.format('<font color="%s"><b>└</b></font>', colors.inner_box),
            string.rep(dash, 76),
            string.format('<font color="%s"><b>┘</b></font>', colors.inner_box),
            " ", wall
        })
    end

    -- One content row. Padding is based on the BYTE length of the line, so
    -- multi-byte characters or embedded markup make the row look shorter or
    -- longer than 74 columns (known simplification).
    local function boost_content_line(line, colors)
        local CONTENT_WIDTH = 74
        local body = line
        local missing = CONTENT_WIDTH - #line
        if missing > 0 then
            body = line .. string.rep(" ", missing)
        end
        local outer = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
        local inner = string.format('<font color="%s"><b>│</b></font>', colors.inner_box)
        local text = string.format('<font color="%s">%s</font>', colors.content_text, body)
        return table.concat({ outer, inner, text, inner, outer }, " ")
    end

    -- Bottom border: corner, 78-cell progress bar, corner, arrow
    local function boost_bottom_border(progress_percent, colors)
        local WIDTH = 78
        local filled = math.floor(progress_percent * WIDTH)
        local heavy_cell = string.format('<font color="%s"><b>═</b></font>', colors.outer_frame)
        local light_cell = string.format('<font color="%s">─</font>', colors.outer_frame)
        local cells = {}
        for col = 1, WIDTH do
            if col <= filled then
                cells[col] = heavy_cell
            else
                cells[col] = light_cell
            end
        end
        return string.format('<font color="%s"><b>╚</b></font>', colors.outer_frame)
            .. table.concat(cells)
            .. string.format('<font color="%s"><b>╝</b></font>', colors.outer_frame)
            .. string.format('<font color="%s"><b>─▶</b></font>', colors.arrow)
    end

    -- Wrap a poem in the complete nested frame, one framed row per text line
    local function apply_boost_formatting(content, progress_percent, colors)
        local framed = {
            boost_top_border(progress_percent, colors),
            boost_inner_top(colors)
        }
        for text_line in (content .. "\n"):gmatch("(.-)\n") do
            framed[#framed + 1] = boost_content_line(text_line, colors)
        end
        framed[#framed + 1] = boost_inner_bottom(colors)
        framed[#framed + 1] = boost_bottom_border(progress_percent, colors)
        return table.concat(framed, "\n")
    end
    -- }}} End boost formatting helpers

    -- Collect every other poem with its similarity score (0 when unknown).
    -- Issue 8-057: the flat array holds 4 entries per poem.
    local others = {}
    for base = 1, #poems_flat, 4 do
        local candidate_id = poems_flat[base]
        if candidate_id ~= poem_id then
            others[#others + 1] = {
                id = candidate_id,
                content = poems_flat[base + 1],
                category = poems_flat[base + 2],
                is_boost = poems_flat[base + 3],
                similarity = similarity_of[tostring(candidate_id)] or 0
            }
        end
    end
    table.sort(others, function(left, right)
        return left.similarity > right.similarity
    end)

    -- The starting poem always leads the page with similarity 1.0.
    -- Issue 8-022: the cap counts the starting poem, hence the "- 1".
    local ranked = {
        {
            id = poem_id,
            content = poem_content,
            category = poem_category,
            is_boost = poem_is_boost,
            similarity = 1.0
        }
    }
    local extra_limit
    if max_poems_to_show > 0 then
        extra_limit = max_poems_to_show - 1
    else
        extra_limit = #others
    end
    for rank = 1, math.min(extra_limit, #others) do
        ranked[#ranked + 1] = others[rank]
    end

    -- Inline palette (file-level tables cannot be referenced from a worker)
    local palette = {
        red = "#dc3c3c", blue = "#3c78dc", green = "#3cb45a",
        purple = "#8c3cc8", orange = "#e68c3c", yellow = "#c8b428", gray = "#787878"
    }

    -- Render every ranked poem
    local out = {}
    for _, entry in ipairs(ranked) do
        local pid = entry.id

        -- Progress = position of this poem id within the collection, kept as
        -- a 0-1 fraction for the boost helpers and as cells of an 80-wide bar.
        local fraction = pid / max_poem_id
        local percent = fraction * 100
        local done_cells = math.floor((percent / 100) * 80)
        local todo_cells = 80 - done_cells

        local color_name = color_of[tostring(pid)] or "gray"
        local hex_color = palette[color_name] or "#787878"

        out[#out + 1] = string.format(" -> file: %s/%s.txt\n", entry.category or "unknown", pid)

        if entry.is_boost then
            -- Issue 8-057: boosts get the nested frame instead of plain bars
            out[#out + 1] = apply_boost_formatting(entry.content or "", fraction, frame_colors) .. "\n\n"
        else
            -- Regular poems sit between two identical progress bars
            local progress_bar = string.format('<font color="%s"><b>%s</b></font>%s',
                hex_color, string.rep("═", done_cells), string.rep("─", todo_cells))
            out[#out + 1] = string.format('<div aria-label="eighty dashes. %s.">%s</div>\n', color_name, progress_bar)
            out[#out + 1] = (entry.content or "") .. "\n"
            out[#out + 1] = string.format('<div aria-label="eighty dashes. %s.">%s</div>\n\n', color_name, progress_bar)
        end
    end
    local html_content = table.concat(out)

    -- Category prefix keeps file names unique across categories (Issue 8-019)
    local unique_id = string.format("%s-%04d", poem_category, poem_id)

    local html = string.format([[<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Poems sorted by similarity to: %s #%d</title>
</head>
<body>
<center>
<h1>Poetry Collection</h1>
<p>All poems sorted by similarity to: %s #%d</p>
<pre>
%s
</pre>
</center>
</body>
</html>]], poem_category, poem_id, poem_category, poem_id, html_content)

    -- Write the page; report failure to open the target as false
    local target = string.format("%s/similar/%s.html", output_dir, unique_id)
    local handle = io.open(target, "w")
    if not handle then
        return false
    end
    handle:write(html)
    handle:close()
    return true
end
-- }}}
-- {{{ Worker function for difference page generation (centroid-based diversity)
-- Note: This function must not capture any upvalues - all data passed as parameters
-- Issue 8-057: Added boost visual formatting support
--- Worker: write the "sorted by difference" page for one starting poem.
-- The sequence is built greedily: starting from the given poem, repeatedly
-- append the remaining poem whose embedding has the largest cosine distance
-- from the centroid (mean) of everything selected so far.
-- Must not capture any upvalues; everything it needs arrives as a parameter.
-- Issue 8-057: boosted poems are drawn inside a nested frame.
--
-- The centroid is maintained as a running sum, and each embedding's norm is
-- computed once and cached. Both use the same accumulation order as a full
-- recomputation, so the selected sequence is unchanged while the per-step
-- cost drops to one dot product per remaining poem.
--
-- @param poem_id                 numeric id of the starting poem
-- @param poem_content            text of the starting poem
-- @param poem_category           category of the starting poem (used in the file name)
-- @param poem_is_boost           truthy when the starting poem is a boost
-- @param starting_embedding_flat embedding of the starting poem (array of numbers)
-- @param all_poems_array         flat array, 4 entries per poem: id, content, category, is_boost
-- @param all_embeddings_flat     all embeddings concatenated, embedding_dim numbers per
--                                poem, in the same poem order as all_poems_array
-- @param embedding_dim           number of components per embedding
-- @param poem_colors_table       map tostring(id) -> colour name
-- @param max_poem_id             divisor for the per-poem progress bar
-- @param output_dir              page goes to <output_dir>/different/<category>-<id>.html
-- @param diversity_limit         max sequence length including the start; <= 0 means all
-- @param boost_colors            table with arrow, outer_frame, inner_box, content_text
-- @return true when the file was opened and written, false otherwise
local function diversity_worker(poem_id, poem_content, poem_category, poem_is_boost,
                                starting_embedding_flat, all_poems_array, all_embeddings_flat,
                                embedding_dim, poem_colors_table, max_poem_id, output_dir,
                                diversity_limit, boost_colors)
    -- CRITICAL: copy every shared (effil) table into a plain Lua table once.
    -- The selection loop below touches the embeddings O(n²) times; doing that
    -- through effil tables was the cause of issue 8-002 "Bug 2: Catastrophic
    -- effil.table Access Overhead".
    local local_poems_array = {}
    for i = 1, #all_poems_array do
        local_poems_array[i] = all_poems_array[i]
    end
    local local_embeddings_flat = {}
    for i = 1, #all_embeddings_flat do
        local_embeddings_flat[i] = all_embeddings_flat[i]
    end
    local local_starting_embedding = {}
    for i = 1, #starting_embedding_flat do
        local_starting_embedding[i] = starting_embedding_flat[i]
    end
    local local_colors = {}
    for k, v in pairs(poem_colors_table) do
        local_colors[k] = v
    end
    local local_boost_colors = {}
    for k, v in pairs(boost_colors) do
        local_boost_colors[k] = v
    end

    -- {{{ Issue 8-057: boost frame helpers (defined here so nothing is captured
    -- from file scope); simplified variant without a navigation section

    -- Top border: arrow, corner, 78-cell bar with the [BOOST] label centred in
    -- the filled part, corner.
    local function boost_top_border(progress_percent, colors)
        local BAR_WIDTH = 78
        local LABEL = "[BOOST]"
        local LABEL_LEN = 7
        local progress_chars = math.floor(progress_percent * BAR_WIDTH)
        if progress_chars < LABEL_LEN + 2 then
            progress_chars = LABEL_LEN + 2 -- always leave room for the label
        end
        local label_start = math.floor(progress_chars / 2) - math.floor(LABEL_LEN / 2)
        if label_start < 1 then label_start = 1 end

        local heavy_cell = string.format('<font color="%s"><b>═</b></font>', colors.outer_frame)
        local light_cell = string.format('<font color="%s">─</font>', colors.outer_frame)

        local cells = {}
        for i = 1, BAR_WIDTH do
            if i >= label_start and i < label_start + LABEL_LEN then
                local char = LABEL:sub(i - label_start + 1, i - label_start + 1)
                cells[i] = string.format('<font color="%s"><b>%s</b></font>', colors.arrow, char)
            elseif i <= progress_chars then
                cells[i] = heavy_cell
            else
                cells[i] = light_cell
            end
        end

        return string.format('<font color="%s"><b>◀─</b></font>', colors.arrow)
            .. string.format('<font color="%s"><b>╔</b></font>', colors.outer_frame)
            .. table.concat(cells)
            .. string.format('<font color="%s"><b>╗</b></font>', colors.outer_frame)
    end

    -- Top edge of the inner content box, between the outer walls
    local function boost_inner_top(colors)
        local outer_wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
        local inner_corner_left = string.format('<font color="%s"><b>┌</b></font>', colors.inner_box)
        local inner_corner_right = string.format('<font color="%s"><b>┐</b></font>', colors.inner_box)
        local inner_dash = string.format('<font color="%s">─</font>', colors.inner_box)
        return outer_wall .. " " .. inner_corner_left .. string.rep(inner_dash, 76) .. inner_corner_right .. " " .. outer_wall
    end

    -- Bottom edge of the inner content box, between the outer walls
    local function boost_inner_bottom(colors)
        local outer_wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
        local inner_corner_left = string.format('<font color="%s"><b>└</b></font>', colors.inner_box)
        local inner_corner_right = string.format('<font color="%s"><b>┘</b></font>', colors.inner_box)
        local inner_dash = string.format('<font color="%s">─</font>', colors.inner_box)
        return outer_wall .. " " .. inner_corner_left .. string.rep(inner_dash, 76) .. inner_corner_right .. " " .. outer_wall
    end

    -- One content row. Padding is based on the BYTE length of the line, so
    -- multi-byte characters or embedded markup make the row look shorter or
    -- longer than 74 columns (known simplification).
    local function boost_content_line(line, colors)
        local CONTENT_WIDTH = 74
        local visible_length = #line
        local padded_line = line
        if visible_length < CONTENT_WIDTH then
            padded_line = line .. string.rep(" ", CONTENT_WIDTH - visible_length)
        end
        local outer_wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
        local inner_wall = string.format('<font color="%s"><b>│</b></font>', colors.inner_box)
        local colored_content = string.format('<font color="%s">%s</font>', colors.content_text, padded_line)
        return outer_wall .. " " .. inner_wall .. " " .. colored_content .. " " .. inner_wall .. " " .. outer_wall
    end

    -- Bottom border: corner, 78-cell progress bar, corner, arrow
    local function boost_bottom_border(progress_percent, colors)
        local BAR_WIDTH = 78
        local progress_chars = math.floor(progress_percent * BAR_WIDTH)
        local heavy_cell = string.format('<font color="%s"><b>═</b></font>', colors.outer_frame)
        local light_cell = string.format('<font color="%s">─</font>', colors.outer_frame)
        local cells = {}
        for i = 1, BAR_WIDTH do
            if i <= progress_chars then
                cells[i] = heavy_cell
            else
                cells[i] = light_cell
            end
        end
        return string.format('<font color="%s"><b>╚</b></font>', colors.outer_frame)
            .. table.concat(cells)
            .. string.format('<font color="%s"><b>╝</b></font>', colors.outer_frame)
            .. string.format('<font color="%s"><b>─▶</b></font>', colors.arrow)
    end

    -- Wrap a poem in the complete nested frame, one framed row per text line
    local function apply_boost_formatting(content, progress_percent, colors)
        local lines = {}
        lines[#lines + 1] = boost_top_border(progress_percent, colors)
        lines[#lines + 1] = boost_inner_top(colors)
        for line in (content .. "\n"):gmatch("(.-)\n") do
            lines[#lines + 1] = boost_content_line(line, colors)
        end
        lines[#lines + 1] = boost_inner_bottom(colors)
        lines[#lines + 1] = boost_bottom_border(progress_percent, colors)
        return table.concat(lines, "\n")
    end
    -- }}} End boost formatting helpers

    -- Starting embedding restricted to embedding_dim components
    local starting_embedding = {}
    for i = 1, embedding_dim do
        starting_embedding[i] = local_starting_embedding[i]
    end
    -- Every vector operation below runs over this many components
    local dim = #starting_embedding

    -- Build the candidate list: every poem except the starting one, each with
    -- its own slice of the flat embedding array.
    -- Issue 8-057: the flat poem array holds 4 entries per poem.
    local remaining_poems = {}
    local num_poems = #local_poems_array / 4
    for i = 1, num_poems do
        local idx = (i - 1) * 4 + 1
        local other_id = local_poems_array[idx]
        if other_id ~= poem_id then
            local emb_start = (i - 1) * embedding_dim + 1
            local embedding = {}
            for j = 1, embedding_dim do
                embedding[j] = local_embeddings_flat[emb_start + j - 1]
            end
            remaining_poems[#remaining_poems + 1] = {
                id = other_id,
                content = local_poems_array[idx + 1],
                category = local_poems_array[idx + 2],
                is_boost = local_poems_array[idx + 3],
                embedding = embedding
                -- norm: filled in lazily on first comparison
            }
        end
    end

    -- The sequence always starts with the given poem
    local diversity_sequence = {{
        id = poem_id,
        content = poem_content,
        category = poem_category,
        is_boost = poem_is_boost
    }}

    -- Running component-wise sum of all selected embeddings. Starting from
    -- 0 and adding in selection order reproduces exactly the sums a full
    -- recomputation over the selected list would give.
    local centroid_sum = {}
    for i = 1, dim do
        centroid_sum[i] = 0 + starting_embedding[i]
    end
    local selected_count = 1
    local centroid = {}

    -- Limit diversity sequence if specified
    local max_sequence = diversity_limit > 0 and diversity_limit or (#remaining_poems + 1)

    while #remaining_poems > 0 and #diversity_sequence < max_sequence do
        -- Current centroid and its norm: computed once per step
        local centroid_sq = 0
        for i = 1, dim do
            local c = centroid_sum[i] / selected_count
            centroid[i] = c
            centroid_sq = centroid_sq + (c * c)
        end
        local centroid_norm = math.sqrt(centroid_sq)

        -- Find the poem farthest from the centroid; strict ">" keeps the
        -- earliest candidate on ties.
        local max_dist = -1
        local max_idx = -1
        for i, poem_info in ipairs(remaining_poems) do
            local emb = poem_info.embedding

            -- A poem's norm never changes, so compute it once and reuse it
            local norm = poem_info.norm
            if not norm then
                local sum_sq = 0
                for k = 1, dim do
                    sum_sq = sum_sq + (emb[k] * emb[k])
                end
                norm = math.sqrt(sum_sq)
                poem_info.norm = norm
            end

            local dot_product = 0
            for k = 1, dim do
                dot_product = dot_product + (centroid[k] * emb[k])
            end

            -- Cosine distance; a zero-length vector is treated as distance 1
            local dist
            if centroid_norm == 0 or norm == 0 then
                dist = 1.0
            else
                dist = 1.0 - (dot_product / (centroid_norm * norm))
            end

            if dist > max_dist then
                max_dist = dist
                max_idx = i
            end
        end

        if max_idx <= 0 then
            break -- no comparable candidate (e.g. every distance was NaN)
        end

        -- table.remove keeps the relative order of the remaining candidates,
        -- which the tie-breaking above depends on.
        local selected = table.remove(remaining_poems, max_idx)
        diversity_sequence[#diversity_sequence + 1] = {
            id = selected.id,
            content = selected.content,
            category = selected.category,
            is_boost = selected.is_boost
        }
        local selected_emb = selected.embedding
        for i = 1, dim do
            centroid_sum[i] = centroid_sum[i] + selected_emb[i]
        end
        selected_count = selected_count + 1
    end

    -- Inline palette (file-level tables cannot be referenced from a worker)
    local palette = {
        red = "#dc3c3c", blue = "#3c78dc", green = "#3cb45a",
        purple = "#8c3cc8", orange = "#e68c3c", yellow = "#c8b428", gray = "#787878"
    }

    -- Render every poem of the sequence
    local content_parts = {}
    for _, poem_info in ipairs(diversity_sequence) do
        local pid = poem_info.id

        -- Progress = position of this poem id within the collection, kept as
        -- a 0-1 fraction for the boost helpers and as cells of an 80-wide bar.
        local progress_decimal = pid / max_poem_id
        local progress_pct = progress_decimal * 100
        local progress_chars = math.floor((progress_pct / 100) * 80)
        local remaining_chars = 80 - progress_chars

        local color_name = local_colors[tostring(pid)] or "gray"
        local hex_color = palette[color_name] or "#787878"

        content_parts[#content_parts + 1] =
            string.format(" -> file: %s/%s.txt\n", poem_info.category or "unknown", pid)

        if poem_info.is_boost then
            -- Issue 8-057: boosts get the nested frame instead of plain bars
            local formatted_content = apply_boost_formatting(poem_info.content or "", progress_decimal, local_boost_colors)
            content_parts[#content_parts + 1] = formatted_content .. "\n\n"
        else
            -- Regular poems sit between two identical progress bars
            local progress_section = string.rep("═", progress_chars)
            local remaining_section = string.rep("─", remaining_chars)
            local progress_bar = string.format('<font color="%s"><b>%s</b></font>%s',
                hex_color, progress_section, remaining_section)
            content_parts[#content_parts + 1] =
                string.format('<div aria-label="eighty dashes. %s.">%s</div>\n', color_name, progress_bar)
            content_parts[#content_parts + 1] = (poem_info.content or "") .. "\n"
            content_parts[#content_parts + 1] =
                string.format('<div aria-label="eighty dashes. %s.">%s</div>\n\n', color_name, progress_bar)
        end
    end
    local html_content = table.concat(content_parts)

    -- Category prefix keeps file names unique across categories (Issue 8-019)
    local unique_id = string.format("%s-%04d", poem_category, poem_id)

    local html = string.format([[<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Poems sorted by difference from: %s #%d</title>
</head>
<body>
<center>
<h1>Poetry Collection</h1>
<p>All poems sorted by difference from: %s #%d</p>
<pre>
%s
</pre>
</center>
</body>
</html>]], poem_category, poem_id, poem_category, poem_id, html_content)

    -- Write the page; report failure to open the target as false
    local filename = string.format("%s/different/%s.html", output_dir, unique_id)
    local f = io.open(filename, "w")
    if f then
        f:write(html)
        f:close()
        return true
    end
    return false
end
-- }}}
-- {{{ Worker function for cached difference page generation (uses pre-computed sequences)
-- Much faster than on-the-fly computation - just looks up poem data and generates HTML
-- Issue 8-022: Added max_poems_to_show parameter to limit output size
-- Issue 8-057: Added boost visual formatting support
local function cached_diversity_worker(poem_id, diversity_sequence, all_poems_lookup,
poem_colors_table, max_poem_id, output_dir,
max_poems_to_show, boost_colors)
-- CRITICAL FIX: Copy effil.tables to local Lua tables at worker start
-- See issue 8-002 "Bug 2: Catastrophic effil.table Access Overhead"
-- Issue 8-022: Limit sequence length based on pagination settings
local sequence_limit = max_poems_to_show > 0 and max_poems_to_show or #diversity_sequence
local local_sequence = {}
for i = 1, math.min(#diversity_sequence, sequence_limit) do
local_sequence[i] = diversity_sequence[i]
end
local local_poems_lookup = {}
for k, v in pairs(all_poems_lookup) do
local_poems_lookup[k] = v
end
local local_colors = {}
for k, v in pairs(poem_colors_table) do
local_colors[k] = v
end
local local_boost_colors = {}
for k, v in pairs(boost_colors) do
local_boost_colors[k] = v
end
-- {{{ Issue 8-057: Boost formatting helper functions (no upvalues)
local function boost_top_border(progress_percent, colors)
local BAR_WIDTH = 78
local LABEL = "[BOOST]"
local LABEL_LEN = 7
local progress_chars = math.floor(progress_percent * BAR_WIDTH)
if progress_chars < LABEL_LEN + 2 then
progress_chars = LABEL_LEN + 2
end
local label_center = math.floor(progress_chars / 2)
local label_start = label_center - math.floor(LABEL_LEN / 2)
if label_start < 1 then label_start = 1 end
local colored_arrow = string.format('<font color="%s"><b>◀─</b></font>', colors.arrow)
local colored_frame_left = string.format('<font color="%s"><b>╔</b></font>', colors.outer_frame)
local colored_frame_right = string.format('<font color="%s"><b>╗</b></font>', colors.outer_frame)
local colored_bar = ""
for i = 1, BAR_WIDTH do
if i >= label_start and i < label_start + LABEL_LEN then
local char = LABEL:sub(i - label_start + 1, i - label_start + 1)
colored_bar = colored_bar .. string.format('<font color="%s"><b>%s</b></font>', colors.arrow, char)
elseif i <= progress_chars then
colored_bar = colored_bar .. string.format('<font color="%s"><b>═</b></font>', colors.outer_frame)
else
colored_bar = colored_bar .. string.format('<font color="%s">─</font>', colors.outer_frame)
end
end
return colored_arrow .. colored_frame_left .. colored_bar .. colored_frame_right
end
local function boost_inner_top(colors)
local outer_wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
local inner_corner_left = string.format('<font color="%s"><b>┌</b></font>', colors.inner_box)
local inner_corner_right = string.format('<font color="%s"><b>┐</b></font>', colors.inner_box)
local inner_dash = string.format('<font color="%s">─</font>', colors.inner_box)
return outer_wall .. " " .. inner_corner_left .. string.rep(inner_dash, 76) .. inner_corner_right .. " " .. outer_wall
end
local function boost_inner_bottom(colors)
local outer_wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
local inner_corner_left = string.format('<font color="%s"><b>└</b></font>', colors.inner_box)
local inner_corner_right = string.format('<font color="%s"><b>┘</b></font>', colors.inner_box)
local inner_dash = string.format('<font color="%s">─</font>', colors.inner_box)
return outer_wall .. " " .. inner_corner_left .. string.rep(inner_dash, 76) .. inner_corner_right .. " " .. outer_wall
end
local function boost_content_line(line, colors)
local CONTENT_WIDTH = 74
local visible_length = #line
local padded_line
if visible_length >= CONTENT_WIDTH then
padded_line = line
else
padded_line = line .. string.rep(" ", CONTENT_WIDTH - visible_length)
end
local outer_wall = string.format('<font color="%s"><b>║</b></font>', colors.outer_frame)
local inner_wall = string.format('<font color="%s"><b>│</b></font>', colors.inner_box)
local colored_content = string.format('<font color="%s">%s</font>', colors.content_text, padded_line)
return outer_wall .. " " .. inner_wall .. " " .. colored_content .. " " .. inner_wall .. " " .. outer_wall
end
-- Render the bottom border of the outer frame: ╚, a 78-cell progress bar, ╝,
-- followed by the ─▶ arrow.
-- progress_percent: fraction multiplied by the bar width and floored to get the
--                   number of filled (═, bold) cells; the rest are ─ cells
-- colors: table read for `outer_frame` (corners and bar) and `arrow`
-- Returns the assembled HTML string.
local function boost_bottom_border(progress_percent, colors)
    local BAR_WIDTH = 78
    local filled_cells = math.floor(progress_percent * BAR_WIDTH)
    local frame_color = colors.outer_frame
    local filled_cell = string.format('<font color="%s"><b>═</b></font>', frame_color)
    local empty_cell = string.format('<font color="%s">─</font>', frame_color)

    local pieces = { string.format('<font color="%s"><b>╚</b></font>', frame_color) }
    for column = 1, BAR_WIDTH do
        if column <= filled_cells then
            pieces[#pieces + 1] = filled_cell
        else
            pieces[#pieces + 1] = empty_cell
        end
    end
    pieces[#pieces + 1] = string.format('<font color="%s"><b>╝</b></font>', frame_color)
    pieces[#pieces + 1] = string.format('<font color="%s"><b>─▶</b></font>', colors.arrow)
    return table.concat(pieces)
end
-- Frame `content` in the double box used for boosted entries.
-- Output rows, joined with "\n": top border, inner top edge, one boxed row per
-- line of `content`, inner bottom edge, bottom border.
-- content:          text to frame; split on "\n" (a trailing "\n" is appended
--                   before splitting so the final line is captured too)
-- progress_percent: forwarded to the top and bottom border helpers
-- colors:           forwarded to every helper
local function apply_boost_formatting(content, progress_percent, colors)
    local rendered = {}
    rendered[1] = boost_top_border(progress_percent, colors)
    rendered[2] = boost_inner_top(colors)
    for body_line in (content .. "\n"):gmatch("(.-)\n") do
        rendered[#rendered + 1] = boost_content_line(body_line, colors)
    end
    rendered[#rendered + 1] = boost_inner_bottom(colors)
    rendered[#rendered + 1] = boost_bottom_border(progress_percent, colors)
    return table.concat(rendered, "\n")
end
-- }}} End boost formatting helpers
-- diversity_sequence is already ordered list of poem IDs
-- all_poems_lookup maps poem_id -> {content, category, is_boost}
-- Generate HTML content
local content_parts = {}
for _, pid in ipairs(local_sequence) do
local poem_data = local_poems_lookup[tostring(pid)]
if poem_data then
-- Calculate progress (as decimal 0-1 for boost functions)
local progress_decimal = pid / max_poem_id
local progress_pct = progress_decimal * 100
local progress_chars = math.floor((progress_pct / 100) * 80)
local remaining_chars = 80 - progress_chars
-- Get color (using local copy of effil.table)
local color_name = local_colors[tostring(pid)] or "gray"
local hex_color = ({
red = "#dc3c3c", blue = "#3c78dc", green = "#3cb45a",
purple = "#8c3cc8", orange = "#e68c3c", yellow = "#c8b428", gray = "#787878"
})[color_name] or "#787878"
-- Format poem entry
table.insert(content_parts, string.format(" -> file: %s/%s.txt\n", poem_data.category or "unknown", pid))
-- Issue 8-057: Apply boost formatting for boosted content
if poem_data.is_boost then
local formatted_content = apply_boost_formatting(poem_data.content or "", progress_decimal, local_boost_colors)
table.insert(content_parts, formatted_content .. "\n\n")
else
-- Generate progress bar
local progress_section = string.rep("═", progress_chars)
local remaining_section = string.rep("─", remaining_chars)
local progress_bar = string.format('<font color="%s"><b>%s</b></font>%s',
hex_color, progress_section, remaining_section)
table.insert(content_parts, string.format('<div aria-label="eighty dashes. %s.">%s</div>\n', color_name, progress_bar))
table.insert(content_parts, (poem_data.content or "") .. "\n")
table.insert(content_parts, string.format('<div aria-label="eighty dashes. %s.">%s</div>\n\n', color_name, progress_bar))
end
end
end
local html_content = table.concat(content_parts)
-- Get starting poem's category for unique filename
local starting_poem_data = local_poems_lookup[tostring(poem_id)]
local poem_category = (starting_poem_data and starting_poem_data.category) or "unknown"
local unique_id = string.format("%s-%04d", poem_category, poem_id)
-- Generate HTML
local html = string.format([[<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Poems sorted by difference from: %s #%d</title>
</head>
<body>
<center>
<h1>Poetry Collection</h1>
<p>All poems sorted by difference from: %s #%d</p>
<pre>
%s
</pre>
</center>
</body>
</html>]], poem_category, poem_id, poem_category, poem_id, html_content)
-- Write file (using category prefix for unique filename - Issue 8-019)
local filename = string.format("%s/different/%s.html", output_dir, unique_id)
local f = io.open(filename, "w")
if f then
f:write(html)
f:close()
return true
end
return false
end
-- }}}
-- {{{ Main execution
-- Startup banner: project directory, thread count, run mode and pagination.
print(string.rep("=", 61))
print("Multi-threaded HTML Generation")
print(string.rep("=", 61))
print(string.format("Project directory: %s", relative_path(DIR)))
print(string.format("Thread count: %d", NUM_THREADS))

-- The first matching flag wins: test mode, then similarity-only, then difference-only.
local mode_desc
if TEST_MODE then
    mode_desc = "test (first " .. MAX_TEST_PAGES .. " poems)"
elseif SIMILARITY_ONLY then
    mode_desc = "similarity only"
elseif DIFFERENCE_ONLY then
    mode_desc = "difference only"
else
    mode_desc = "full"
end
print("Mode: " .. mode_desc)

-- Issue 8-022: Display pagination settings (a non-positive limit means "no limit")
if not (MAX_POEMS_TO_SHOW > 0) then
    print("Pagination: unlimited (all poems)")
else
    print(string.format("Pagination: %d pages × %d poems = %d poems per file",
        PAGINATION.max_pages_per_poem, PAGINATION.poems_per_page, MAX_POEMS_TO_SHOW))
end
-- {{{ Pipeline validation (Issue 10-011 Phase G)
print("\n🔍 Validating pipeline data...")
local validator = require("pipeline-validator")
-- Keep validator output minimal
validator.set_verbose(false)

-- The diversity cache is only required when difference pages are wanted
-- (not --similar-only) AND cache mode is on (not on-the-fly calculation).
local require_diversity = (not SIMILARITY_ONLY) and USE_CACHE
local validation = validator.validate_for_html_generation("embeddinggemma_latest", require_diversity)

-- Hard stop: incomplete pipeline data
if not validation.ready then
    print("\n" .. string.rep("=", 60))
    io.stderr:write("\n")
    validator.print_validation_report(validation)
    io.stderr:write(
        "Cannot generate HTML pages with incomplete pipeline data.\n",
        "Run the commands above to fix, or use --no-cache for on-the-fly\n",
        "diversity calculation (slow but doesn't require cache).\n\n"
    )
    os.exit(1)
end

-- Warnings are reported but do not stop the run
local warning_count = #validation.warnings
if warning_count > 0 then
    io.stderr:write("\n")
    validator.print_validation_report(validation)
    io.stderr:write("Continuing despite warnings...\n\n")
end
print(" ✅ Pipeline validation passed")
-- }}}
-- Resolve input file locations under DIR and load the two files every mode needs.
print("\n🔄 Loading data files...")
local poems_file = DIR .. "/assets/poems.json"
local similarities_dir, poem_colors_file, embeddings_file, diversity_cache_file
do
    -- All embedding-derived artifacts live under the same model directory
    local model_dir = DIR .. "/assets/embeddings/embeddinggemma_latest"
    similarities_dir = model_dir .. "/similarities/"
    poem_colors_file = model_dir .. "/poem_colors.json"
    embeddings_file = model_dir .. "/embeddings.json"
    diversity_cache_file = model_dir .. "/diversity_cache.json"
end
local poems_data = utils.read_json_file(poems_file)
local poem_colors_data = utils.read_json_file(poem_colors_file)
-- Filled in later, depending on mode and on whether a cache is available
local embeddings_data, diversity_cache = nil, nil
-- {{{ local function load_individual_similarity_files
-- Load per-poem similarity files (poem_*.json) found under sim_dir via `find`.
-- This replaces the monolithic similarity_matrix.json with per-poem files.
-- Each file is expected to carry `metadata.poem_id` and a `similarities` array of
-- {id=..., similarity=...} entries; files missing any of these are skipped.
-- sim_dir: directory to search (searched recursively by `find`)
-- Returns: similarities, loaded_count
--   similarities: {[tostring(poem_id)] = {[tostring(other_id)] = score, ...}, ...}
--   loaded_count: number of files that contributed an entry
--   Returns nil, 0 when `find` cannot be started or no files match.
local function load_individual_similarity_files(sim_dir)
    -- The directory is embedded in a single-quoted shell word, so any embedded
    -- single quote must be written as '\'' or it would terminate the quoting.
    local escaped_dir = string.gsub(sim_dir, "'", "'\\''")
    local handle = io.popen("find '" .. escaped_dir .. "' -name 'poem_*.json' 2>/dev/null")
    if not handle then
        return nil, 0
    end

    local files = {}
    for filepath in handle:lines() do
        files[#files + 1] = filepath
    end
    handle:close()

    if #files == 0 then
        return nil, 0
    end
    print(string.format(" Loading %d individual similarity files...", #files))

    local similarities = {}
    local loaded_count = 0
    for i, filepath in ipairs(files) do
        local sim_data = utils.read_json_file(filepath)
        if sim_data and sim_data.metadata and sim_data.similarities then
            local poem_id = sim_data.metadata.poem_id
            if poem_id then
                -- Convert array format to dictionary: {other_id: score, ...}
                local poem_sims = {}
                for _, entry in ipairs(sim_data.similarities) do
                    if entry.id then
                        poem_sims[tostring(entry.id)] = entry.similarity
                    end
                end
                similarities[tostring(poem_id)] = poem_sims
                loaded_count = loaded_count + 1
            end
        end
        -- Progress update every 500 files
        if i % 500 == 0 then
            print(string.format(" Loaded %d/%d files...", i, #files))
        end
    end
    return similarities, loaded_count
end
-- }}}
-- Load the remaining inputs, then abort if anything the selected mode needs is missing.
-- Load similarities from individual files (replaces similarity_matrix.json)
-- Skipped entirely with --different-only, in which case similarity_data stays nil.
local similarity_data = nil
local similarity_count = 0
if not DIFFERENCE_ONLY then
similarity_data, similarity_count = load_individual_similarity_files(similarities_dir)
if similarity_data then
print(string.format(" ✅ Loaded similarities for %d poems from individual files", similarity_count))
end
end
-- Check for diversity cache first (much faster than computing on-the-fly)
-- Only attempted when difference pages are wanted and cache mode is enabled.
if not SIMILARITY_ONLY and USE_CACHE then
diversity_cache = utils.read_json_file(diversity_cache_file)
if diversity_cache then
print(" ✅ Diversity cache found - using pre-computed sequences")
end
end
-- Only load embeddings if generating difference pages AND no cache
-- (this also covers the case where cache mode is on but the cache file could not be read)
if not SIMILARITY_ONLY and not diversity_cache then
print(" Loading embeddings (62MB, may take a moment)...")
print(" Note: Run scripts/precompute-diversity-sequences for faster generation")
embeddings_data = utils.read_json_file(embeddings_file)
end
-- Fatal checks. Each one prints a message to stdout and exits with status 1.
-- poems.json is required in every mode.
if not poems_data then
print("❌ Error: Could not load poems.json")
os.exit(1)
end
-- Similarity data is required unless only difference pages were requested.
if not similarity_data and not DIFFERENCE_ONLY then
print("❌ Error: Could not load similarity files from " .. similarities_dir)
print(" Run: ./run.sh --generate-similarity (to generate similarity files)")
os.exit(1)
end
-- Difference pages need either the cache or the raw embeddings.
if not embeddings_data and not diversity_cache and not SIMILARITY_ONLY then
print("❌ Error: Could not load embeddings.json or diversity_cache.json")
print(" Run: scripts/precompute-diversity-sequences")
os.exit(1)
end
-- Validate that every poem with non-empty content has a usable embedding.
-- If the embeddings were not needed for generation (cache mode or --similar-only)
-- they are loaded here just for this check. When the embeddings file cannot be
-- read, or has no `embeddings` array, the check is skipped silently.
local validation_embeddings = embeddings_data
if not validation_embeddings then
    validation_embeddings = utils.read_json_file(embeddings_file)
end
if validation_embeddings and validation_embeddings.embeddings then
    local missing_embeddings = {}
    local invalid_dimensions = {}
    local random_embeddings = 0

    -- Comma-separated preview of at most the first 10 IDs
    local function first_ids(ids)
        return table.concat(ids, ", ", 1, math.min(#ids, 10))
    end

    for i, poem in ipairs(poems_data.poems) do
        if poem.id and poem.content and poem.content ~= "" then
            -- Use the same key as the code that flattens embeddings for the
            -- workers (poem_index when present, else array position), so the
            -- entry validated here is the entry that is actually used.
            local emb_entry = validation_embeddings.embeddings[poem.poem_index or i]
            if not emb_entry or not emb_entry.embedding then
                missing_embeddings[#missing_embeddings + 1] = poem.id
            elseif type(emb_entry.embedding) ~= "table" or #emb_entry.embedding ~= 768 then
                invalid_dimensions[#invalid_dimensions + 1] = poem.id
            elseif emb_entry.is_random then
                random_embeddings = random_embeddings + 1
            end
        end
    end

    if #missing_embeddings > 0 or #invalid_dimensions > 0 then
        print("\n❌ EMBEDDING VALIDATION FAILED")
        print("════════════════════════════════════════════════════════════════════════════")
        if #missing_embeddings > 0 then
            print(string.format(" Missing embeddings: %d poems with content have no embedding", #missing_embeddings))
            print(" First 10 IDs: " .. first_ids(missing_embeddings))
        end
        if #invalid_dimensions > 0 then
            print(string.format(" Invalid dimensions: %d poems have wrong embedding size", #invalid_dimensions))
            print(" First 10 IDs: " .. first_ids(invalid_dimensions))
        end
        print("")
        print(" Please regenerate embeddings before generating HTML:")
        print(" ./run.sh --generate-similarity (option 1)")
        print("════════════════════════════════════════════════════════════════════════════")
        os.exit(1)
    end
    if random_embeddings > 0 then
        print(string.format(" ℹ️ %d empty poems have random embeddings (expected)", random_embeddings))
    end
end
-- Drop this reference so a copy loaded only for validation can be garbage
-- collected before the worker threads start (embeddings_data, if set, is unaffected).
validation_embeddings = nil
-- Normalise the loaded inputs and report what was loaded.
-- `similarities` accepts either a wrapper table with a `similarities` field or the
-- per-poem dictionary itself; it is an empty table when nothing was loaded.
local similarities = {}
if similarity_data then
    similarities = similarity_data.similarities or similarity_data
end

-- Embedding width: taken from the embeddings metadata when available, else 768
local embedding_dim = 768
do
    local emb_meta = embeddings_data and embeddings_data.metadata
    if emb_meta and emb_meta.embedding_dimension then
        embedding_dim = emb_meta.embedding_dimension
    end
end

print("✅ Data files loaded")
print(string.format(" 📄 Poems: %d", #poems_data.poems))
if diversity_cache then
    -- `sequences` is keyed by poem id, so it has to be counted by traversal
    local cache_count = 0
    local sequences = diversity_cache.sequences or {}
    for _ in pairs(sequences) do
        cache_count = cache_count + 1
    end
    print(string.format(" 📦 Diversity cache: %d sequences", cache_count))
elseif embeddings_data then
    print(string.format(" 🧮 Embeddings: %d (dim=%d)", #embeddings_data.embeddings, embedding_dim))
end
-- Build the flat arrays handed to the worker threads.
--   all_poems_array:     4 consecutive values per poem: id, content, category, is_boost
--                        (Issue 8-057: is_boost added for visual formatting support)
--   all_embeddings_flat: exactly embedding_dim consecutive values per poem, in the
--                        same poem order; only filled when embeddings were loaded
--   poem_lookup:         poem_id -> poem record
--   poem_index_lookup:   poem_id -> 1-based slot of that poem in the flat arrays
--   valid_poem_ids:      every poem id encountered, in file order
--   max_poem_id:         largest poem id seen
-- Poems without an id are skipped and occupy no slot.
local all_poems_array = {}
local all_embeddings_flat = {}
local poem_lookup = {}
local poem_index_lookup = {} -- poem_id -> slot in the flat arrays
local valid_poem_ids = {}
local max_poem_id = 0
do
    local poems_len = 0 -- values written to all_poems_array
    local flat_len = 0  -- values written to all_embeddings_flat
    local slot = 0      -- poems written so far
    for i, poem in ipairs(poems_data.poems) do
        if poem.id then
            slot = slot + 1
            -- Issue 8-057: Detect boost poems via metadata.is_boost flag
            local is_boost = (poem.metadata and poem.metadata.is_boost) and true or false
            all_poems_array[poems_len + 1] = poem.id
            all_poems_array[poems_len + 2] = poem.content or ""
            all_poems_array[poems_len + 3] = poem.category or "unknown"
            all_poems_array[poems_len + 4] = is_boost
            poems_len = poems_len + 4

            -- The flat embedding array is only consumed when embeddings were
            -- loaded, so skip the (large) zero fill otherwise.
            if embeddings_data then
                -- Use poem_index for lookup (Issue 8-019: cross-category ID collision fix)
                local lookup_key = poem.poem_index or i
                local emb_data = embeddings_data.embeddings[lookup_key]
                local embedding = emb_data and emb_data.embedding
                -- Always write exactly embedding_dim values (zero-padding missing
                -- ones) so every poem occupies a fixed-width stride.
                for j = 1, embedding_dim do
                    flat_len = flat_len + 1
                    all_embeddings_flat[flat_len] = (embedding and embedding[j]) or 0
                end
            end

            poem_lookup[poem.id] = poem
            -- Record the slot in the flat arrays, not the position in
            -- poems_data.poems: the two differ once any poem without an id has
            -- been skipped, and the slow-path difference stage computes its
            -- embedding offset as (slot - 1) * embedding_dim + 1.
            poem_index_lookup[poem.id] = slot
            valid_poem_ids[slot] = poem.id
            if poem.id > max_poem_id then
                max_poem_id = poem.id
            end
        end
    end
end
-- Color lookup for the workers: tostring(poem_id) -> color name ("gray" when the
-- entry has no color). Left empty when the colors file was not loaded.
local poem_colors_lookup = {}
do
    local color_entries = poem_colors_data and poem_colors_data.poem_colors
    if color_entries then
        for id, data in pairs(color_entries) do
            poem_colors_lookup[tostring(id)] = data.color or "gray"
        end
    end
end

-- Content lookup for the cached diversity worker:
-- tostring(poem_id) -> {content, category, is_boost}
-- Issue 8-057: is_boost flag included for visual formatting support
local poem_content_lookup = {}
for _, poem in ipairs(poems_data.poems) do
    local id = poem.id
    if id then
        local meta = poem.metadata
        poem_content_lookup[tostring(id)] = {
            content = poem.content or "",
            category = poem.category or "unknown",
            is_boost = not not (meta and meta.is_boost),
        }
    end
end
-- Process poems in ascending id order so runs are reproducible
table.sort(valid_poem_ids)

-- Test mode keeps only the first MAX_TEST_PAGES ids (or all, if there are fewer)
if TEST_MODE then
    local keep = math.min(MAX_TEST_PAGES, #valid_poem_ids)
    local limited_ids = {}
    for position = 1, keep do
        limited_ids[position] = valid_poem_ids[position]
    end
    valid_poem_ids = limited_ids
end
-- {{{ Incremental mode: skip poems that already have HTML files
-- Filter out poem IDs that have existing output files
-- Report whether `path` can be opened for reading.
-- Returns true if io.open succeeds (the handle is closed again), false otherwise.
local function file_exists(path)
    local handle = io.open(path, "r")
    if handle == nil then
        return false
    end
    handle:close()
    return true
end
-- Decide, per poem, which page types still have to be generated.
-- Outputs used by the generation stages below:
--   sim_needed_set[poem_id]  = true when the similarity page must be generated
--   diff_needed_set[poem_id] = true when the difference page must be generated
--   valid_poem_ids           = narrowed (incremental mode only) to poems needing
--                              at least one page type that this run will produce
local skipped_similar = 0
local skipped_different = 0
-- NOTE(review): original_count is not read anywhere in the visible part of this file.
local original_count = #valid_poem_ids
-- Track which poem IDs need similarity vs difference pages (for incremental mode)
local sim_needed_set = {}
local diff_needed_set = {}
if INCREMENTAL_MODE then
local output_dir = DIR .. "/output"
-- Check each poem for existing files (using category prefix for unique filenames)
local sim_needed_count = 0
local diff_needed_count = 0
for _, poem_id in ipairs(valid_poem_ids) do
-- Get poem's category for unique filename (Issue 8-019)
local poem = poem_lookup[poem_id]
local category = poem and poem.category or "unknown"
local unique_id = string.format("%s-%04d", category, poem_id)
local sim_file = string.format("%s/similar/%s.html", output_dir, unique_id)
local diff_file = string.format("%s/different/%s.html", output_dir, unique_id)
-- Both page types are checked regardless of --similar-only / --different-only;
-- the mode flags are applied in the filtering pass below.
if not file_exists(sim_file) then
sim_needed_set[poem_id] = true
sim_needed_count = sim_needed_count + 1
else
skipped_similar = skipped_similar + 1
end
if not file_exists(diff_file) then
diff_needed_set[poem_id] = true
diff_needed_count = diff_needed_count + 1
else
skipped_different = skipped_different + 1
end
end
-- Filter to poems needing at least one type of page
-- (a page type disabled by the mode flags never counts as "needed")
local filtered_ids = {}
for _, poem_id in ipairs(valid_poem_ids) do
local needs_sim = (not DIFFERENCE_ONLY) and sim_needed_set[poem_id]
local needs_diff = (not SIMILARITY_ONLY) and diff_needed_set[poem_id]
if needs_sim or needs_diff then
table.insert(filtered_ids, poem_id)
end
end
valid_poem_ids = filtered_ids
-- Report only when at least one existing file was found
if skipped_similar > 0 or skipped_different > 0 then
print(string.format(" ⏭️ Incremental mode: skipping existing files"))
if not DIFFERENCE_ONLY then
print(string.format(" Similar pages: %d exist, %d to generate", skipped_similar, sim_needed_count))
end
if not SIMILARITY_ONLY then
print(string.format(" Difference pages: %d exist, %d to generate", skipped_different, diff_needed_count))
end
end
-- If nothing to do, exit early
if #valid_poem_ids == 0 then
print("\n✅ All pages already exist, nothing to generate")
os.exit(0)
end
else
-- Non-incremental mode: mark all poems as needing generation
for _, poem_id in ipairs(valid_poem_ids) do
sim_needed_set[poem_id] = true
diff_needed_set[poem_id] = true
end
end
-- }}}
print(string.format(" 🔢 Pages to generate: %d", #valid_poem_ids))

-- Create output directories
local output_dir = DIR .. "/output"
do
    -- Quote a path as one single-quoted shell word (embedded ' becomes '\'') so
    -- that directories containing spaces or shell metacharacters are created
    -- correctly instead of being split or interpreted by the shell.
    local function shell_quote(path)
        return "'" .. (string.gsub(path, "'", "'\\''")) .. "'"
    end
    os.execute("mkdir -p " .. shell_quote(output_dir .. "/similar"))
    os.execute("mkdir -p " .. shell_quote(output_dir .. "/different"))
end

-- Convert data to effil-shareable format
print("\n📦 Converting data for parallel processing...")
local shared_poems_array = effil.table(all_poems_array)
local shared_colors = effil.table(poem_colors_lookup)
-- Issue 8-057: Share boost color config for worker threads
local shared_boost_colors = effil.table(BOOST_COLOR_CONFIG)
-- Embeddings are only shared when they were loaded (on-the-fly diversity path)
local shared_embeddings = nil
if embeddings_data then
    print(" Converting embeddings to shared format...")
    shared_embeddings = effil.table(all_embeddings_flat)
    print(string.format(" ✅ Embeddings converted (%d values)", #all_embeddings_flat))
end
-- Generate similarity pages in parallel (unless --different-only).
-- Poems are handled in batches of up to NUM_THREADS: every thread of a batch is
-- started, then all are joined before the next batch begins.
-- FAIL-FAST: the first page that cannot be confirmed ends the run with os.exit(1).
-- A page counts as generated only if its worker did not report "failed" or
-- "canceled" AND the expected output file can be opened afterwards.
local sim_completed = 0
local sim_elapsed = 0
if not DIFFERENCE_ONLY then
    print("\n🚀 Generating similarity pages...")
    local start_time = os.time()
    -- A non-positive thread count would never advance batch_start below.
    local batch_size = math.max(1, NUM_THREADS)
    local batch_start = 1
    while batch_start <= #valid_poem_ids do
        local batch_end = math.min(batch_start + batch_size - 1, #valid_poem_ids)

        -- Start threads for this batch. Poems whose similarity page is not
        -- needed (already present in incremental mode) get no thread.
        local batch_threads = {}
        local batch_poem_ids = {}
        local thread_idx = 0
        for i = batch_start, batch_end do
            local poem_id = valid_poem_ids[i]
            if sim_needed_set[poem_id] then
                local poem = poem_lookup[poem_id]
                local poem_sims = similarities[tostring(poem_id)] or {}
                -- Issue 8-057: Detect boost status for visual formatting
                local is_boost = (poem.metadata and poem.metadata.is_boost) and true or false
                -- Convert similarities to effil table
                local shared_sims = effil.table(poem_sims)
                thread_idx = thread_idx + 1
                batch_poem_ids[thread_idx] = poem_id
                -- Issue 8-022: Pass max_poems_to_show for pagination control
                -- Issue 8-057: Pass is_boost and boost_colors for visual formatting
                batch_threads[thread_idx] = effil.thread(similarity_worker)(
                    poem_id,
                    poem.content or "",
                    poem.category or "unknown",
                    is_boost,
                    shared_sims,
                    shared_poems_array,
                    shared_colors,
                    max_poem_id,
                    output_dir,
                    MAX_POEMS_TO_SHOW,
                    shared_boost_colors
                )
            end
        end

        -- Wait for batch to complete (FAIL-FAST: any failure stops generation)
        for i = 1, thread_idx do
            -- wait() reports the thread status (plus an error message on failure).
            -- Checking it matters because a page left over from an earlier run
            -- would otherwise make a crashed worker look successful.
            local status, thread_err = batch_threads[i]:wait()
            local thread_failed = (status == "failed" or status == "canceled")

            local poem_id = batch_poem_ids[i]
            -- Get poem's category for unique filename (Issue 8-019)
            local poem = poem_lookup[poem_id]
            local category = poem and poem.category or "unknown"
            local unique_id = string.format("%s-%04d", category, poem_id)
            local filename = string.format("%s/similar/%s.html", output_dir, unique_id)

            local page_written = false
            if not thread_failed then
                local f = io.open(filename, "r")
                if f then
                    f:close()
                    page_written = true
                end
            end

            if page_written then
                sim_completed = sim_completed + 1
            else
                -- FAIL-FAST: Stop immediately on any page generation failure
                print(string.format("\n❌ SIMILARITY PAGE GENERATION FAILED"))
                print(string.format(" Poem ID: %s #%d", category, poem_id))
                print(string.format(" Expected file: %s", filename))
                if thread_failed then
                    print(string.format(" Worker thread %s: %s", tostring(status), tostring(thread_err)))
                end
                print(string.format(" Completed before failure: %d/%d", sim_completed, #valid_poem_ids))
                print("")
                print(" Remedy: Check similarity data for this poem:")
                print(string.format(" - Similarity file: %s/assets/embeddings/embeddinggemma_latest/similarities/poem_%d.json", DIR, poem_id))
                print(" - Regenerate similarity data: ./run.sh --generate-similarity")
                os.exit(1)
            end
        end

        -- Progress update
        local elapsed = os.time() - start_time
        local rate = sim_completed / math.max(elapsed, 1)
        print(string.format(" Similarity: %d/%d completed (%.1f pages/sec)",
            sim_completed, #valid_poem_ids, rate))
        batch_start = batch_end + 1
    end
    sim_elapsed = os.time() - start_time
    print(string.format("\n✅ Similarity pages: %d in %d seconds (%.1f pages/sec)",
        sim_completed, sim_elapsed, sim_completed / math.max(sim_elapsed, 1)))
else
    print("\n⏭️ Skipping similarity pages (--different-only)")
end
-- Generate difference pages in parallel (unless --similar-only).
-- Two paths:
--   * cached:     the poem's pre-computed sequence from diversity_cache is handed
--                 to cached_diversity_worker
--   * on-the-fly: the poem's embedding is sliced out of all_embeddings_flat and
--                 handed to diversity_worker together with the shared embeddings
-- Poems are handled in batches of up to NUM_THREADS, joined batch by batch.
-- FAIL-FAST: the first page that cannot be confirmed ends the run with os.exit(1).
local diff_completed = 0
local diff_elapsed = 0
if not SIMILARITY_ONLY and (diversity_cache or shared_embeddings) then
    local using_cache = diversity_cache ~= nil
    -- Cached path only: the content lookup is converted to a shareable table
    -- ONCE and reused by every worker, instead of being rebuilt for every poem.
    local shared_poem_content = nil
    local cached_sequences = nil
    -- Poems that had no entry in the cache and were therefore not generated
    local missing_sequences = 0
    if using_cache then
        print("\n🚀 Generating difference pages (using cached sequences)...")
        shared_poem_content = effil.table(poem_content_lookup)
        cached_sequences = diversity_cache.sequences or {}
    else
        print("\n🚀 Generating difference pages (O(n²) diversity algorithm)...")
        print(" Note: This is slower than similarity due to centroid calculations")
        print(" Tip: Run scripts/precompute-diversity-sequences for faster generation")
    end

    local start_time = os.time()
    -- A non-positive thread count would never advance batch_start below.
    local batch_size = math.max(1, NUM_THREADS)
    local batch_start = 1
    while batch_start <= #valid_poem_ids do
        local batch_end = math.min(batch_start + batch_size - 1, #valid_poem_ids)

        -- Start threads for this batch. Poems whose difference page is not
        -- needed (already present in incremental mode) get no thread.
        local batch_threads = {}
        local batch_poem_ids = {}
        local thread_idx = 0
        for i = batch_start, batch_end do
            local poem_id = valid_poem_ids[i]
            if diff_needed_set[poem_id] then
                if using_cache then
                    -- Use cached sequence (fast path)
                    local cached_sequence = cached_sequences[tostring(poem_id)]
                    if cached_sequence then
                        thread_idx = thread_idx + 1
                        batch_poem_ids[thread_idx] = poem_id
                        -- Issue 8-022: Pass max_poems_to_show for pagination control
                        -- Issue 8-057: Pass boost_colors for visual formatting
                        batch_threads[thread_idx] = effil.thread(cached_diversity_worker)(
                            poem_id,
                            effil.table(cached_sequence),
                            shared_poem_content,
                            shared_colors,
                            max_poem_id,
                            output_dir,
                            MAX_POEMS_TO_SHOW,
                            shared_boost_colors
                        )
                    else
                        -- No cached sequence: the page is skipped (not fatal),
                        -- but counted so the omission is reported below.
                        missing_sequences = missing_sequences + 1
                    end
                else
                    -- Compute on-the-fly (slow path)
                    local poem = poem_lookup[poem_id]
                    local poem_index = poem_index_lookup[poem_id]
                    -- Issue 8-057: Detect boost status for visual formatting
                    local is_boost = (poem.metadata and poem.metadata.is_boost) and true or false
                    -- Extract embedding for starting poem
                    local emb_start = (poem_index - 1) * embedding_dim + 1
                    local starting_embedding = {}
                    for j = 1, embedding_dim do
                        starting_embedding[j] = all_embeddings_flat[emb_start + j - 1]
                    end
                    thread_idx = thread_idx + 1
                    batch_poem_ids[thread_idx] = poem_id
                    -- Issue 8-057: Pass is_boost and boost_colors for visual formatting
                    batch_threads[thread_idx] = effil.thread(diversity_worker)(
                        poem_id,
                        poem.content or "",
                        poem.category or "unknown",
                        is_boost,
                        effil.table(starting_embedding),
                        shared_poems_array,
                        shared_embeddings,
                        embedding_dim,
                        shared_colors,
                        max_poem_id,
                        output_dir,
                        DIVERSITY_LIMIT,
                        shared_boost_colors
                    )
                end
            end
        end

        -- Wait for batch to complete (FAIL-FAST: any failure stops generation)
        for i = 1, thread_idx do
            -- wait() reports the thread status (plus an error message on failure).
            -- Checking it matters because a page left over from an earlier run
            -- would otherwise make a crashed worker look successful.
            local status, thread_err = batch_threads[i]:wait()
            local thread_failed = (status == "failed" or status == "canceled")

            local poem_id = batch_poem_ids[i]
            -- Get poem's category for unique filename (Issue 8-019)
            local poem = poem_lookup[poem_id]
            local category = poem and poem.category or "unknown"
            local unique_id = string.format("%s-%04d", category, poem_id)
            local filename = string.format("%s/different/%s.html", output_dir, unique_id)

            local page_written = false
            if not thread_failed then
                local f = io.open(filename, "r")
                if f then
                    f:close()
                    page_written = true
                end
            end

            if page_written then
                diff_completed = diff_completed + 1
            else
                -- FAIL-FAST: Stop immediately on any page generation failure
                print(string.format("\n❌ DIFFERENCE PAGE GENERATION FAILED"))
                print(string.format(" Poem ID: %s #%d", category, poem_id))
                print(string.format(" Expected file: %s", filename))
                if thread_failed then
                    print(string.format(" Worker thread %s: %s", tostring(status), tostring(thread_err)))
                end
                print(string.format(" Completed before failure: %d/%d", diff_completed, #valid_poem_ids))
                print("")
                print(" Remedy: Check diversity data for this poem:")
                print(string.format(" - Diversity cache: %s/assets/embeddings/embeddinggemma_latest/diversity_cache.json", DIR))
                print(" - Regenerate diversity data: scripts/precompute-diversity-sequences")
                os.exit(1)
            end
        end

        -- Progress update
        local elapsed = os.time() - start_time
        local rate = diff_completed / math.max(elapsed, 1)
        print(string.format(" Difference: %d/%d completed (%.1f pages/sec)",
            diff_completed, #valid_poem_ids, rate))
        batch_start = batch_end + 1
    end
    diff_elapsed = os.time() - start_time
    print(string.format("\n✅ Difference pages: %d in %d seconds (%.1f pages/sec)",
        diff_completed, diff_elapsed, diff_completed / math.max(diff_elapsed, 1)))
    if missing_sequences > 0 then
        print(string.format(" ⚠️ %d poems had no cached diversity sequence and were skipped", missing_sequences))
        print(" Run: scripts/precompute-diversity-sequences")
    end
else
    if SIMILARITY_ONLY then
        print("\n⏭️ Skipping difference pages (--similar-only)")
    else
        print("\n⚠️ Skipping difference pages (no cache or embeddings)")
        print(" Run: scripts/precompute-diversity-sequences")
    end
end
-- Summary (FAIL-FAST: reaching this point means no page generation failure occurred)
print("\n" .. string.rep("=", 61))
print("Generation Summary")
print(string.rep("=", 61))

local want_similar = not DIFFERENCE_ONLY
local want_different = not SIMILARITY_ONLY

-- Per-stage page counts, only for the stages that were requested
if want_similar then
    print(string.format(" Similarity pages: %d completed", sim_completed))
end
if want_different then
    print(string.format(" Difference pages: %d completed", diff_completed))
end

-- Total is the sum of the two stage timings (each measured in whole seconds)
local total_elapsed = sim_elapsed + diff_elapsed
print(string.format(" Total time: %d seconds", total_elapsed))

-- Where the pages are, plus a ready-to-run command for one example page per stage
print("\n📖 Output directory: " .. relative_path(output_dir))
if want_similar then
    print(" Similar: firefox " .. relative_path(output_dir) .. "/similar/fediverse-0001.html")
end
if want_different then
    print(" Different: firefox " .. relative_path(output_dir) .. "/different/fediverse-0001.html")
end
-- }}}