src/semantic-color-calculator.lua

422 lines

1#!/usr/bin/env lua
2
3-- Semantic color calculation system for neocities-modernization
4-- Generates color embeddings and precomputes poem-to-color mappings
5
6-- {{{ local function setup_dir_path
-- Resolve the project root directory.
-- @param provided_dir  optional path; any truthy value is returned unchanged
-- @return provided_dir when it is truthy, otherwise the built-in default path
local function setup_dir_path(provided_dir)
    local default_dir = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
    return provided_dir or default_dir
end
13-- }}}
14
-- Script configuration: start from the default project root, then let the
-- first positional command-line argument override it. Any argument that begins
-- with "-" (which covers -I) is a flag and must never be mistaken for DIR.
local DIR = setup_dir_path()
if arg then
    local idx = 1
    while arg[idx] ~= nil do
        local candidate = arg[idx]
        if candidate:sub(1, 1) ~= "-" then
            -- first non-flag argument wins; later ones are ignored
            DIR = candidate
            break
        end
        idx = idx + 1
    end
end
25
26-- Load required libraries
27package.path = DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
28local utils = require("utils")
29local dkjson = require("dkjson")
30-- Shared progress renderer (Issue 10-051 family): one animated \r bar on a TTY,
31-- plain newline-terminated lines under --debug (VKC_DEBUG, so a redirected log
32-- keeps the full history), and silent when piped. Replaces the old every-100
33-- "[INFO] Progress:" lines that scrolled the console during a full run.
34local progress = require("progress-display")
35
36-- Issue 10-003: Load unified config from config.lua
37local config_loader = require("config-loader")
38config_loader.set_project_root(DIR)
39local unified_config = config_loader.load()
40
41-- Endpoint resolution goes through the shared inference-server-config module so that
42-- --server=<name> and default_inference_server are honored here the same way
43-- they are honored by the rest of the pipeline. Previously this file had a
44-- hardcoded fallback IP that drifted from config.lua and quietly broke
45-- color-embedding generation when the IP no longer pointed at a live server.
46local inference_config = require("inference-server-config")
47-- Issue 10-050: shared batched embedding primitive. We hand it our endpoint and
48-- prompt formatter so fuzzy-computing's separate config instance is never used.
49local fuzzy = require("fuzzy-computing")
50-- combine_chunk_vectors: reused to mean-combine a color's association-word
51-- embeddings into one centroid (same recombination used for long-poem chunks).
52local text_chunking = require("text-chunking")
53inference_config.set_project_root(DIR)
54
55-- Initialize asset path configuration (CLI --dir takes precedence over config)
56utils.init_assets_root(arg)
57
58local M = {}
59
60-- {{{ function cosine_similarity
-- Cosine similarity between two numeric vectors of equal length.
-- @param vec1  array of numbers
-- @param vec2  array of numbers with the same length as vec1
-- @return dot(vec1, vec2) / (|vec1| * |vec2|); 0.0 when either norm is zero
-- Raises an error when the two lengths differ.
local function cosine_similarity(vec1, vec2)
    local dim = #vec1
    if dim ~= #vec2 then
        error("Vectors must have same dimension")
    end

    -- One pass accumulates the dot product and both squared norms.
    local dot, sq_a, sq_b = 0, 0, 0
    for idx = 1, dim do
        local a, b = vec1[idx], vec2[idx]
        dot = dot + (a * b)
        sq_a = sq_a + (a * a)
        sq_b = sq_b + (b * b)
    end

    local len_a = math.sqrt(sq_a)
    local len_b = math.sqrt(sq_b)

    -- A zero-length vector has no direction; report "no similarity" instead of
    -- dividing by zero.
    if len_a == 0 or len_b == 0 then
        return 0.0
    end

    return dot / (len_a * len_b)
end
86-- }}}
87
88-- {{{ local function compute_color_stats
89-- Per-color mean/std of cosine similarity across ALL poems, used to z-score the
90-- color assignment below. Why this is needed: bare color-word embeddings suffer
91-- from "hubness" -- a couple of them (yellow, green) sit slightly nearer the
92-- centre of the whole poem cloud, so by raw nearest-cosine they win ~70% of all
93-- poems (38% yellow + 29% green) while blue gets 2.5%. The colours are NOT
94-- evenly spread anchors; they are bunched, and tiny baseline offsets decide
95-- everything. Standardising each colour's similarity (subtract its mean, divide
96-- by its std) makes a poem pick the colour it is most ABOVE-baseline for, which
97-- balances the distribution to ~10-18% each without touching the embeddings.
-- @param poems_data            table with a `poems` array; only entries that
--                              have a poem_index are counted
-- @param poem_embeddings_data  table with an `embeddings` array, read at the
--                              same position as the poem
-- @param color_embeddings      map of color name -> embedding vector
-- @return map of color name -> { mean = number, std = number }; std is floored
--         at sqrt(1e-9) so it can be used as a divisor
local function compute_color_stats(poems_data, poem_embeddings_data, color_embeddings)
    -- One accumulator per color: running sum and running sum of squares.
    local acc = {}
    for color in pairs(color_embeddings) do
        acc[color] = { total = 0, total_sq = 0 }
    end

    local counted = 0
    for idx, poem in ipairs(poems_data.poems) do
        local entry = poem_embeddings_data.embeddings[idx]
        if poem.poem_index and entry and entry.embedding then
            counted = counted + 1
            for color, anchor in pairs(color_embeddings) do
                if anchor then
                    local sim = cosine_similarity(entry.embedding, anchor)
                    local slot = acc[color]
                    slot.total = slot.total + sim
                    slot.total_sq = slot.total_sq + sim * sim
                end
            end
        end
    end

    local stats = {}
    for color, slot in pairs(acc) do
        local mean, variance = 0, 0
        if counted > 0 then
            mean = slot.total / counted
            -- population variance via E[x^2] - E[x]^2
            variance = slot.total_sq / counted - mean * mean
        end
        -- the floor also absorbs a slightly negative variance from rounding
        stats[color] = { mean = mean, std = math.sqrt(math.max(variance, 1e-9)) }
    end
    return stats
end
122-- }}}
123
124-- {{{ function calculate_semantic_color_for_poem
125-- color_stats is optional. With it, the colour is chosen by z-scored similarity
126-- (hubness-corrected); without it, by raw nearest cosine (legacy behaviour). The
127-- returned similarity is always the RAW cosine, so downstream displays are
128-- unchanged -- only the WINNER selection is standardised.
-- @param poem_embedding    embedding vector of the poem
-- @param color_embeddings  map of color name -> embedding vector
-- @param color_stats       optional map of color name -> { mean, std }; a color
--                          with an entry is ranked by its z-score, a color
--                          without one by its raw cosine
-- @return winning color name ("gray" when nothing could be scored) and that
--         color's raw cosine similarity (-1 when nothing could be scored)
local function calculate_semantic_color_for_poem(poem_embedding, color_embeddings, color_stats)
    local winner_name = "gray"
    local winner_score = -math.huge
    local winner_raw = -1

    for color_name, anchor in pairs(color_embeddings) do
        if anchor then
            local raw = cosine_similarity(poem_embedding, anchor)

            -- Rank by the standardised score when stats exist for this color,
            -- otherwise by the raw cosine.
            local stat = color_stats and color_stats[color_name]
            local ranking = raw
            if stat then
                ranking = (raw - stat.mean) / stat.std
            end

            if ranking > winner_score then
                winner_name = color_name
                winner_score = ranking
                winner_raw = raw
            end
        end
    end

    return winner_name, winner_raw
end
152-- }}}
153
154-- {{{ function generate_color_embeddings
155-- endpoint is optional: nil means "ask inference-server-config for the active server",
156-- which is the right behavior for almost all callers. The parameter exists
157-- so that test harnesses or one-off scripts can target a specific server
158-- without having to mutate inference-server-config's module-local selection.
159--
160-- Each color's embedding is the MEAN of the embeddings of its association words
161-- (config.color_associations), giving a richer "essence" anchor than the bare
162-- color word. color_associations is optional: without it (or for a color missing
163-- from it) we fall back to embedding the bare color name, so old callers keep
164-- working. Words are embedded in one batched request per color via the shared
165-- primitive; endpoint + prompt formatter are passed so this file's config
166-- instance stays authoritative (matching prefixes are essential so colors and
167-- poems land in the same embedding space). Mean (not length-weighted) combine:
168-- every association word should count equally regardless of its spelling length.
-- Build one embedding per color name.
-- @param color_names         array of color names to embed
-- @param model_name          optional; defaults to inference_config.get_selected_model()
-- @param endpoint            optional; defaults to inference_config.build_host_url()
-- @param color_associations  optional map of color name -> array of words to embed
--                            and combine; a color with no entry, or with an EMPTY
--                            list, is embedded from its bare name
-- @return map of color name -> combined vector. Colors whose batch request failed,
--         or that produced no usable vector, are logged and left out of the map.
function M.generate_color_embeddings(color_names, model_name, endpoint, color_associations)
    local color_embeddings = {}
    model_name = model_name or inference_config.get_selected_model()
    endpoint = endpoint or inference_config.build_host_url()

    utils.log_info(string.format("Generating embeddings for %d colors using model: %s", #color_names, model_name))

    for _, color_name in ipairs(color_names) do
        local words = color_associations and color_associations[color_name]
        -- An empty list is truthy in Lua, so it would otherwise skip the
        -- bare-name fallback and the color would be dropped with nothing to embed.
        if not words or (type(words) == "table" and #words == 0) then
            words = { color_name }
        end

        local vectors, err = fuzzy.get_embeddings_batch(
            words, model_name, endpoint, inference_config.format_embedding_prompt)
        if not vectors then
            utils.log_error(string.format("Color embedding batch failed for %s: %s", color_name, tostring(err)))
        else
            -- keep only the well-formed vectors, then mean-combine into a centroid
            local vecs = {}
            for i = 1, #words do
                local candidate = vectors[i]
                if type(candidate) == "table" and #candidate > 0 then
                    vecs[#vecs + 1] = candidate
                end
            end

            if #vecs > 0 then
                local centroid = text_chunking.combine_chunk_vectors(vecs, nil, "mean")
                -- Guard the result: taking the length of a nil centroid in the
                -- log line below would abort the whole run.
                if type(centroid) == "table" then
                    color_embeddings[color_name] = centroid
                    utils.log_info(string.format("Color %s: centroid from %d/%d association words (dim %d)",
                        color_name, #vecs, #words, #centroid))
                else
                    utils.log_error("No association embeddings for color: " .. color_name)
                end
            else
                utils.log_error("No association embeddings for color: " .. color_name)
            end
        end
    end

    return color_embeddings
end
200-- }}}
201
202-- {{{ function precompute_poem_colors
-- True when a poem can be colored: it has a poem_index and the embedding entry
-- at the same array position carries an embedding.
local function poem_has_embedding(poem, entry)
    if poem.poem_index and entry and entry.embedding then
        return true
    end
    return false
end

-- Assign a semantic color to every embeddable poem and write the result as JSON.
-- @param poems_data            table with a `poems` array
-- @param poem_embeddings_data  table with an `embeddings` array (read at the same
--                              position as the poem) and an optional `model_name`
-- @param color_embeddings      map of color name -> embedding vector
-- @param output_file           path handed to utils.write_json_file
-- @return poem_colors (poem_index -> { color, similarity, calculated_at }) and
--         the full table that was written: poem_colors, generated_at, total_poems,
--         model_used, color_count (distinct colors assigned), color_distribution
function M.precompute_poem_colors(poems_data, poem_embeddings_data, color_embeddings, output_file)
    local poems = poems_data.poems
    local entries = poem_embeddings_data.embeddings
    local label = " 🎨 Semantic colors"

    -- Count eligible poems up front so the progress display has a denominator.
    -- Note: Use poem.poem_index (globally unique) not poem.id (per-category, NOT unique)
    local total_poems = 0
    for i, poem in ipairs(poems) do
        if poem_has_embedding(poem, entries[i]) then
            total_poems = total_poems + 1
        end
    end

    utils.log_info(string.format("Computing semantic colors for %d poems", total_poems))

    -- Hubness correction: gather each color's similarity distribution across all
    -- poems first, so the assignment below can z-score it.
    local color_stats = compute_color_stats(poems_data, poem_embeddings_data, color_embeddings)

    local poem_colors = {}
    local processed_count = 0
    for i, poem in ipairs(poems) do
        local entry = entries[i]
        if poem_has_embedding(poem, entry) then
            local color, similarity = calculate_semantic_color_for_poem(
                entry.embedding, color_embeddings, color_stats)

            -- Key by poem_index (globally unique across all categories)
            poem_colors[poem.poem_index] = {
                color = color,
                similarity = similarity,
                calculated_at = os.date("%Y-%m-%d %H:%M:%S")
            }
            processed_count = processed_count + 1

            -- Throttle redraws: every 100 poems when progress.mode() is 2 (the
            -- verbose/--debug mode, so the log stays readable), every 25 otherwise.
            local step = (progress.mode() == 2) and 100 or 25
            if processed_count % step == 0 then
                progress.update(label, processed_count, total_poems,
                    string.format("poem_index %d = %s", poem.poem_index, color))
            end
        end
    end
    -- Final frame at the true count (the throttle above can stop short of it),
    -- then close the animated line so later output starts on a fresh row.
    progress.update(label, processed_count, total_poems, "done")
    progress.finish()

    -- Tally poems per color and count the distinct colors actually assigned.
    -- color_count used to be written as a constant 0.
    local color_counts = {}
    local distinct_colors = 0
    for _, poem_color in pairs(poem_colors) do
        local seen = color_counts[poem_color.color]
        if not seen then
            distinct_colors = distinct_colors + 1
        end
        color_counts[poem_color.color] = (seen or 0) + 1
    end

    -- Save to file for use during HTML generation
    local output_data = {
        poem_colors = poem_colors,
        generated_at = os.date("%Y-%m-%d %H:%M:%S"),
        total_poems = processed_count,
        model_used = poem_embeddings_data.model_name or "unknown",
        color_count = distinct_colors,
        color_distribution = color_counts
    }
    utils.write_json_file(output_file, output_data)

    utils.log_info(string.format("Precomputed colors for %d poems", processed_count))
    utils.log_info("Color distribution:")
    for color, count in pairs(color_counts) do
        utils.log_info(string.format(" %s: %d poems (%.1f%%)", color, count, (count / processed_count) * 100))
    end

    return poem_colors, output_data
end
284-- }}}
285
286-- {{{ function M.main
-- Menu options 1 and 3: embed the configured colors and save them.
-- @return true when at least one color embedding was generated and written
local function run_generate_color_embeddings(color_names, color_embeddings_file)
    print("Generating color embeddings...")
    local model_name = "embeddinggemma:latest"
    -- Pass the configured association words through so the interactive path
    -- builds the same centroids as a caller that supplies them; when the config
    -- has no color_associations this is nil and the bare color names are used.
    local color_embeddings = M.generate_color_embeddings(
        color_names, model_name, nil, unified_config.color_associations)

    if not next(color_embeddings) then
        utils.log_error("No color embeddings generated")
        return false
    end

    utils.write_json_file(color_embeddings_file, {
        embeddings = color_embeddings,
        generated_at = os.date("%Y-%m-%d %H:%M:%S"),
        model_name = model_name,
        color_count = #color_names
    })
    utils.log_info("Color embeddings saved to: " .. color_embeddings_file)
    return true
end

-- Menu options 2 and 3: load poems, poem embeddings and color embeddings, then
-- precompute and save the poem -> color mapping.
local function run_precompute_poem_colors(paths)
    print("Loading poem embeddings...")
    local poems_data = utils.read_json_file(paths.poems)
    local embeddings_data = utils.read_json_file(paths.embeddings)
    -- Load color embeddings (either just generated or existing)
    local color_embeddings_data = utils.read_json_file(paths.color_embeddings)

    if not (poems_data and embeddings_data and color_embeddings_data) then
        utils.log_error("Failed to load required data files")
        return
    end

    print("Precomputing poem colors...")
    M.precompute_poem_colors(
        poems_data, embeddings_data, color_embeddings_data.embeddings, paths.poem_colors)
    utils.log_info("Poem colors saved to: " .. paths.poem_colors)
end

-- Menu option 4: show the color one poem receives, plus its similarity to every color.
local function run_single_poem_test(paths)
    io.write("Enter poem ID to test: ")
    local poem_id = tonumber(io.read())
    if not poem_id then
        utils.log_error("Poem ID must be a number")
        return
    end

    local poems_data = utils.read_json_file(paths.poems)
    local embeddings_data = utils.read_json_file(paths.embeddings)
    local color_embeddings_data = utils.read_json_file(paths.color_embeddings)
    if not (poems_data and embeddings_data and color_embeddings_data) then
        utils.log_error("Failed to load required data files")
        return
    end

    -- Prefer poem_index: it is globally unique, whereas poem.id repeats across
    -- categories. Fall back to the first matching id only when no poem_index matches.
    local poem_data, position
    for i, poem in ipairs(poems_data.poems) do
        if poem.poem_index == poem_id then
            poem_data, position = poem, i
            break
        end
    end
    if not poem_data then
        for i, poem in ipairs(poems_data.poems) do
            if poem.id == poem_id then
                poem_data, position = poem, i
                break
            end
        end
    end

    local entry = position and embeddings_data.embeddings[position]
    local poem_embedding = entry and entry.embedding
    if not (poem_data and poem_embedding) then
        print("Could not find poem or embedding for ID:", poem_id)
        return
    end

    -- Use the same z-scored selection as precompute_poem_colors so the color
    -- reported here matches the one written to poem_colors.json.
    local anchors = color_embeddings_data.embeddings
    local color_stats = compute_color_stats(poems_data, embeddings_data, anchors)
    local color, similarity = calculate_semantic_color_for_poem(poem_embedding, anchors, color_stats)

    print(string.format("Poem %d (%s):", poem_id, poem_data.category or "unknown"))
    -- content may be absent; show an empty preview rather than crashing
    print("Content preview:", tostring(poem_data.content or ""):sub(1, 100) .. "...")
    print(string.format("Semantic color: %s (similarity: %.3f)", color, similarity))

    -- Show all color similarities, with the z-score that decided the winner
    print("All color similarities:")
    for color_name, anchor in pairs(anchors) do
        if anchor then
            local sim = cosine_similarity(poem_embedding, anchor)
            local stat = color_stats[color_name]
            print(string.format(" %s: %.3f (z=%+.2f)", color_name, sim, (sim - stat.mean) / stat.std))
        end
    end
end

-- Entry point. Without interactive_mode it only logs a usage hint; with it, it
-- shows a four-option menu and runs the selected step(s).
-- @param interactive_mode  truthy to show the menu
function M.main(interactive_mode)
    if not interactive_mode then
        utils.log_info("Use -I flag for interactive mode")
        return
    end

    print("Semantic Color Calculator - Interactive Mode")
    print("1. Generate color embeddings only")
    print("2. Precompute poem colors (requires existing embeddings)")
    print("3. Generate color embeddings + precompute poem colors")
    print("4. Test color calculation on single poem")
    io.write("Select option (1-4): ")
    local choice = io.read()
    -- tolerate stray whitespace around the typed option
    if choice then
        choice = choice:match("^%s*(.-)%s*$")
    end

    -- Issue 10-003: Use unified config instead of semantic-colors.json
    local paths = {
        poems = utils.asset_path("poems.json"),
        embeddings = utils.embeddings_dir() .. "/embeddings.json",
        color_embeddings = utils.embeddings_dir() .. "/color_embeddings.json",
        poem_colors = utils.embeddings_dir() .. "/poem_colors.json"
    }

    local color_names = unified_config.color_names
    if not color_names then
        utils.log_error("Failed to load color_names from unified config")
        return
    end

    if choice == "1" or choice == "3" then
        -- a failed generation stops option 3 before the precompute step
        if not run_generate_color_embeddings(color_names, paths.color_embeddings) then
            return
        end
    end

    if choice == "2" or choice == "3" then
        run_precompute_poem_colors(paths)
    elseif choice == "4" then
        run_single_poem_test(paths)
    elseif choice ~= "1" then
        utils.log_error("Unknown option: " .. tostring(choice))
    end
end
406-- }}}
407
-- Command line execution: whenever the interpreter exposes `arg`, run main();
-- interactive mode is enabled only if "-I" appears among the arguments.
-- NOTE(review): the standalone interpreter defines `arg` globally, so this also
-- runs when the module is require()d from another script -- confirm intended.
if arg then
    local interactive = false
    local idx = 1
    while arg[idx] ~= nil and not interactive do
        interactive = (arg[idx] == "-I")
        idx = idx + 1
    end

    M.main(interactive)
end
421
422return M