src/centroid-generator.lua
1#!/usr/bin/env lua
2
3-- Centroid Generator
4-- Generates custom embedding centroids from user-defined source files and keywords.
5-- These centroids serve as alternative starting points for similarity/diversity exploration,
6-- allowing users to discover poems by mood or theme rather than by existing poem.
7--
-- The generator reads the centroid definitions from the unified config (config.lua),
-- embeds each centroid's combined content, and writes the results to centroids.json
-- in the directory returned by utils.embeddings_dir() for use by the HTML generator.
11
-- {{{ local function setup_dir_path
-- Resolves the project root directory.
-- @param provided_dir  optional override; any truthy value is returned unchanged
-- @return provided_dir when given, otherwise the hard-coded default checkout path
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
-- }}}
20
21-- Script configuration
22local DIR = setup_dir_path()
23
24-- Load required libraries
25package.path = DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
26local utils = require("utils")
27local dkjson = require("dkjson")
28local inference_config = require("inference-server-config")
29-- Issue 10-050: shared chunker + batched embedding primitive replace this file's
30-- own recursive binary-split chunker and per-chunk curl. endpoint + prompt
31-- formatter are threaded in so fuzzy-computing's separate config instance is
32-- never consulted for server selection.
33local fuzzy = require("fuzzy-computing")
34local text_chunking = require("text-chunking")
35
36-- Issue 10-003: Load unified config from config.lua
37local config_loader = require("config-loader")
38config_loader.set_project_root(DIR)
39local unified_config = config_loader.load()
40
41-- Initialize asset path configuration
42utils.init_assets_root(arg)
43
44local M = {}
45
-- {{{ Configuration
-- Issue 10-003: the centroid definitions themselves come from the unified config.
-- Fields:
--   model_name            GGUF-style identifier the inference server expects
--                         (e.g. qwen3-embedding:4b), as reported by
--                         inference_config.get_selected_model().
--   model_storage_name    model_name made filesystem-safe: every character that
--                         is not alphanumeric, "-", "_" or "." becomes "_"
--                         (e.g. qwen3-embedding_4b).
--   embedding_dimensions  starts as nil; filled in at runtime from the length of
--                         the first embedding vector received, so a model swap
--                         needs no code change.
--   max_content_length    upper content bound.
--                         NOTE(review): no reader of this field is visible in this file.
--   min_content_length    minimum combined-text length (in bytes, via #) a
--                         centroid must reach before it is embedded.
local _selected_model = inference_config.get_selected_model()
-- The parentheses keep only gsub's first result (the rewritten string),
-- discarding the substitution count.
local _storage_name = (_selected_model:gsub("[^%w%-_.]", "_"))
local CONFIG = {
    model_name = _selected_model,
    model_storage_name = _storage_name,
    embedding_dimensions = nil,
    max_content_length = 20000,
    min_content_length = 10
}
-- }}}
62
-- {{{ local function generate_embedding_with_chunking
-- Issue 10-050: embeds a centroid's combined text and returns the LIST of its
-- per-chunk vectors; folding them into one centroid is the caller's job.
--
-- The text is split by the shared chunker (text-chunking.lua) using a token
-- counter and chunk budget obtained from fuzzy-computing for this endpoint, and
-- every chunk is then sent in ONE batched embedding request. The endpoint and
-- the prompt formatter are passed along explicitly so the centroid is embedded
-- the same way as the poems.
--
-- @param text      combined centroid text
-- @param endpoint  inference server base URL, forwarded to the fuzzy-computing helpers
-- @param depth     unused; kept only so existing call sites stay valid
-- @return list of vectors, "success" on success; otherwise nil plus a reason
--         ("empty_text", the batch error or "batch_failed", "no_embeddings_generated")
local function generate_embedding_with_chunking(text, endpoint, depth)
    local token_counter = fuzzy.make_token_counter(endpoint)
    local token_budget = fuzzy.embedding_chunk_budget(endpoint, inference_config.format_embedding_prompt)
    local pieces = text_chunking.chunk_text_by_tokens(text, token_counter, token_budget)
    if #pieces == 0 then
        utils.log_error("Centroid text produced no chunks (empty after preprocessing)")
        return nil, "empty_text"
    end

    -- One request covers every chunk of this centroid.
    local vectors, err = fuzzy.get_embeddings_batch(
        pieces, CONFIG.model_name, endpoint, inference_config.format_embedding_prompt)
    if not vectors then
        utils.log_error("Centroid embedding batch failed: " .. tostring(err))
        return nil, err or "batch_failed"
    end

    local total = #pieces
    local kept = {}
    for idx = 1, total do
        local vec = vectors[idx]
        local usable = type(vec) == "table" and #vec > 0
        if not usable then
            utils.log_warn(string.format(" Missing vector for centroid chunk %d/%d", idx, total))
        else
            kept[#kept + 1] = vec
            -- The first usable vector teaches us the model's dimension.
            CONFIG.embedding_dimensions = CONFIG.embedding_dimensions or #vec
        end
    end

    if #kept > 0 then
        return kept, "success"
    end
    return nil, "no_embeddings_generated"
end
-- }}}
115
-- {{{ local function calculate_ultra_centroid
-- Folds a list of chunk embeddings into a single unit-length centroid vector.
--
-- The vectors are summed component-wise and the sum is scaled to unit length.
-- Dividing by the chunk count first would be redundant because normalization
-- rescales to length 1 regardless of the input magnitude (see Issue 9-003).
-- A single embedding goes through the same path, so it is simply normalized.
--
-- @param chunk_embeddings  list of numeric vectors that all share one length
-- @return a new table holding the normalized centroid (the inputs are never
--         modified). A zero-magnitude sum is returned as-is (all zeros) instead
--         of being divided by zero. Returns nil when the list is nil or empty,
--         and nil, "dimension_mismatch" when the vectors differ in length.
local function calculate_ultra_centroid(chunk_embeddings)
    if not chunk_embeddings or #chunk_embeddings == 0 then
        return nil
    end

    local dim = #chunk_embeddings[1]

    -- Start from the zero vector and accumulate every chunk embedding.
    local centroid = {}
    for i = 1, dim do
        centroid[i] = 0
    end

    for _, embedding in ipairs(chunk_embeddings) do
        -- A sum over vectors of different lengths is meaningless; report it
        -- instead of crashing on a nil component or silently truncating.
        if #embedding ~= dim then
            return nil, "dimension_mismatch"
        end
        for i = 1, dim do
            centroid[i] = centroid[i] + embedding[i]
        end
    end

    -- Normalize to unit length (makes any prior scaling irrelevant).
    local sum_of_squares = 0
    for i = 1, dim do
        sum_of_squares = sum_of_squares + centroid[i] * centroid[i]
    end
    local magnitude = math.sqrt(sum_of_squares)

    -- Guard against division by zero: a zero vector stays a zero vector.
    if magnitude > 0 then
        for i = 1, dim do
            centroid[i] = centroid[i] / magnitude
        end
    end

    return centroid
end
-- }}}
173
-- {{{ local function load_source_files
-- Reads each listed file and joins the ones that could be read.
-- Unreadable files are logged as warnings and skipped.
-- @param file_paths  list of file paths; nil is treated as an empty list
-- @return the loaded contents joined by a blank line ("" when nothing loaded)
local function load_source_files(file_paths)
    local loaded = {}

    for _, path in ipairs(file_paths or {}) do
        local body, read_err = utils.read_file(path)
        if not body then
            utils.log_warn(" Could not load source file: " .. path .. " - " .. (read_err or "unknown error"))
        else
            loaded[#loaded + 1] = body
            utils.log_info(" Loaded source file: " .. path .. " (" .. #body .. " chars)")
        end
    end

    return table.concat(loaded, "\n\n")
end
-- }}}
192
-- {{{ local function build_centroid_text
-- Assembles the text that gets embedded for one centroid definition.
-- The concatenated source files come first, then the keywords (one per line);
-- the two sections are separated by a blank line and empty sections are omitted.
-- @param centroid_def  centroid definition with optional source_files and keywords lists
-- @return the combined text ("" when neither section has content)
local function build_centroid_text(centroid_def)
    local sections = {}

    local from_files = load_source_files(centroid_def.source_files)
    if from_files ~= "" then
        sections[#sections + 1] = from_files
    end

    local keywords = centroid_def.keywords
    if keywords and #keywords > 0 then
        sections[#sections + 1] = table.concat(keywords, "\n")
    end

    return table.concat(sections, "\n\n")
end
-- }}}
213
-- {{{ local function generate_centroid_embedding
-- Produces the finished record for a single centroid definition:
-- build the combined text, embed it chunk by chunk, fold the chunk vectors
-- into one centroid, and package the result with bookkeeping fields.
-- @param centroid_def  centroid definition (name, description, output_slug,
--                      source_files, keywords)
-- @param endpoint      inference server base URL
-- @return record table, "success" on success; otherwise nil plus a reason
--         ("content_too_short", the embedding failure status, or
--         "centroid_calculation_failed")
local function generate_centroid_embedding(centroid_def, endpoint)
    utils.log_info("Processing centroid: " .. centroid_def.name)

    -- Step 1: combined text, rejected when shorter than the configured minimum.
    local text = build_centroid_text(centroid_def)
    local text_length = #text
    if text_length < CONFIG.min_content_length then
        utils.log_error(" Combined content too short (" .. text_length .. " chars) - need at least " .. CONFIG.min_content_length)
        return nil, "content_too_short"
    end
    utils.log_info(" Combined content: " .. text_length .. " chars")

    -- Step 2: one embedding vector per chunk.
    local vectors, status = generate_embedding_with_chunking(text, endpoint)
    if not vectors then
        utils.log_error(" Failed to generate embeddings: " .. (status or "unknown"))
        return nil, status
    end
    utils.log_info(" Generated " .. #vectors .. " chunk embedding(s)")

    -- Step 3: collapse the chunk vectors into a single normalized centroid.
    local centroid = calculate_ultra_centroid(vectors)
    if not centroid then
        utils.log_error(" Failed to calculate ultra-centroid")
        return nil, "centroid_calculation_failed"
    end
    utils.log_info(" Ultra-centroid calculated successfully")

    local record = {}
    record.name = centroid_def.name
    record.description = centroid_def.description
    record.output_slug = centroid_def.output_slug
    record.embedding = centroid
    record.chunk_count = #vectors
    record.content_length = text_length
    record.generated_at = utils.get_timestamp()
    return record, "success"
end
-- }}}
260
-- {{{ function M.generate_all_centroids
-- Wraps a value in single quotes so it can be interpolated into a shell
-- command line as exactly one argument; embedded single quotes become '\''.
local function shell_quote(value)
    return "'" .. (tostring(value):gsub("'", "'\\''")) .. "'"
end

-- Embeds every centroid defined in the unified config (Issue 10-003) and writes
-- the results to centroids.json inside utils.embeddings_dir().
--
-- Steps: probe the inference endpoint, embed each centroid definition (failed
-- ones are logged and skipped), then serialize everything with dkjson.
--
-- @param options  optional table; currently unused
-- @return results table, "success" on success. results.centroids maps each
--         output_slug to its record; results.metadata carries model, dimensions,
--         generated_at and source_config.
--         On failure returns nil plus "inference_unavailable",
--         "config_parse_error" or "write_error".
function M.generate_all_centroids(options)
    options = options or {}

    -- Check inference server availability
    local endpoint = inference_config.build_host_url()
    utils.log_info("Using inference endpoint: " .. endpoint)

    -- Verify endpoint is reachable. /v1/models is llama.cpp's OpenAI-
    -- compatible liveness probe (was /api/tags under Ollama). The URL is
    -- quoted so unusual characters cannot be reinterpreted by the shell.
    local probe_cmd = "curl -s --max-time 5 " .. shell_quote(endpoint .. "/v1/models") .. " > /dev/null 2>&1"
    local probe_result = os.execute(probe_cmd)
    -- os.execute reports success as 0 (Lua 5.1) or true (Lua 5.2+).
    if probe_result ~= 0 and probe_result ~= true then
        utils.log_error("Cannot reach the inference endpoint: " .. endpoint)
        utils.log_error("Please ensure the inference server is running and accessible.")
        return nil, "inference_unavailable"
    end

    -- Issue 10-003: Load centroids from unified config instead of assets/centroids.json
    local centroids_list = unified_config.centroids
    if not centroids_list or #centroids_list == 0 then
        utils.log_error("No centroids defined in config.lua")
        return nil, "config_parse_error"
    end

    utils.log_info("Found " .. #centroids_list .. " centroid definition(s)")

    -- metadata.dimensions is filled in after the loop: the dimension is only
    -- learned once the first embedding vector has come back from the server.
    local results = {
        centroids = {},
        metadata = {
            model = CONFIG.model_name,
            generated_at = utils.get_timestamp(),
            source_config = "config.lua"
        }
    }

    local success_count = 0
    local error_count = 0

    for i, centroid_def in ipairs(centroids_list) do
        utils.log_info(string.format("\n[%d/%d] Processing: %s", i, #centroids_list, centroid_def.name))

        local slug = centroid_def.output_slug
        local result, status
        if slug == nil then
            -- Without a slug the result cannot be stored (nil is not a valid
            -- table key), so fail this centroid before doing any embedding work.
            status = "missing_output_slug"
        else
            result, status = generate_centroid_embedding(centroid_def, endpoint)
        end

        if result then
            results.centroids[slug] = result
            success_count = success_count + 1
        else
            error_count = error_count + 1
            utils.log_error(" Skipping centroid due to error: " .. (status or "unknown"))
        end
    end

    -- By now the embedding step has recorded the model's vector length
    -- (it stays nil only if no centroid produced a vector).
    results.metadata.dimensions = CONFIG.embedding_dimensions

    utils.log_info(string.format("\nGeneration complete: %d succeeded, %d failed", success_count, error_count))

    -- Determine output path.
    -- Issue 10-054: route through embeddings_dir() so centroids follow the
    -- RAM/disk switch like every other movable cache.
    local output_dir = utils.embeddings_dir()
    -- Quoted so directories containing spaces or shell metacharacters work.
    os.execute("mkdir -p " .. shell_quote(output_dir))

    local output_file = output_dir .. "/centroids.json"

    -- Save results
    local json_output = dkjson.encode(results, {indent = true})
    local write_success, write_err = utils.write_file(output_file, json_output)

    if not write_success then
        utils.log_error("Failed to save centroids: " .. (write_err or "unknown"))
        return nil, "write_error"
    end

    utils.log_info("Centroids saved to: " .. output_file)

    return results, "success"
end
-- }}}
343
-- {{{ Main execution
-- Runs the generator only when arg[0] ends in "centroid-generator.lua", i.e.
-- when this file is the script being executed. On success a per-centroid
-- summary is logged; on failure the process exits with status 1.
local invoked_as_script = arg and arg[0] and arg[0]:match("centroid%-generator%.lua$")
if invoked_as_script then
    utils.log_info("=== Centroid Generator ===")
    utils.log_info("Generating custom mood/theme centroids for exploration pages")
    utils.log_info("")

    local generated, outcome = M.generate_all_centroids()

    if not generated then
        utils.log_error("Centroid generation failed: " .. (outcome or "unknown"))
        os.exit(1)
    else
        utils.log_info("\n=== Summary ===")
        local total = 0
        for slug, entry in pairs(generated.centroids) do
            total = total + 1
            utils.log_info(string.format(" %s: %s (%d chunks)", slug, entry.name, entry.chunk_count))
        end
        utils.log_info(string.format("Total: %d centroids generated", total))
    end
end
-- }}}
366
367return M
368