libs/pipeline-validator.lua
1-- {{{ Pipeline Validator Module
2-- Provides validation functions for checking pipeline data completeness and freshness
3-- Used by scripts to fail-fast before starting long-running operations
4-- }}}
5
6local M = {}
7
8local dkjson = require('dkjson')
10
11-- {{{ Configuration
-- Default paths and switches used by the validator.
-- `default_model` is deliberately left unset in this table; it only serves as
-- the fallback that each check_* function uses when its caller passes no model.
M.config = {
    verbose = false,
    default_model = nil,
    poems_json = "assets/poems.json",
    embeddings_dir = "assets/embeddings",
    output_similar_dir = "output/similar",
    output_different_dir = "output/different",
}
24
25-- Resolve default_model from the shared resolver (which reads this run's --model
26-- override notepad, then config.lua). pcall-guarded so a validator used outside a
27-- configured project still loads; a nil default then flows to utils.*_dir(), which
28-- resolves the model through the very same path -- so there is no hardcoded
29-- fallback anywhere in the chain.
do
    -- Run the whole lookup inside a single pcall. A missing module, a module
    -- that does not return a table, a missing get_selected_model function, or
    -- a resolver that raises all leave default_model unset instead of aborting
    -- the load of this validator.
    local ok, model = pcall(function()
        local inference_config = require("inference-server-config")
        return inference_config.get_selected_model()
    end)
    if ok and model then
        M.config.default_model = model
    end
end
39-- }}}
40
41-- {{{ Helper: Log functions
-- Writes one error line (prefix, message, newline) to stderr.
local function log_error(msg)
    local line = "❌ ERROR: " .. msg .. "\n"
    io.stderr:write(line)
end
45
-- Writes one warning line (prefix, message, newline) to stderr.
local function log_warning(msg)
    local line = "⚠️ WARNING: " .. msg .. "\n"
    io.stderr:write(line)
end
49
-- Writes an informational line to stderr, but only while M.config.verbose is
-- truthy; otherwise it does nothing.
local function log_info(msg)
    if not M.config.verbose then
        return
    end
    local line = "ℹ️ " .. msg .. "\n"
    io.stderr:write(line)
end
55-- }}}
56
57-- {{{ Helper: Count files matching pattern
-- Counts regular files under `directory` (searched by `find`, so nested
-- directories are included) whose name matches the glob `pattern`.
-- Parameters:
--   directory  path to search; nil/false yields 0
--   pattern    glob handed to `find -name`
-- Returns: the number of matches, or 0 when the command cannot be started or
-- its output is not a number.
local function count_files(directory, pattern)
    if not directory then
        return 0
    end

    -- Quote for a POSIX shell: wrap in single quotes and rewrite each embedded
    -- single quote as '\'' so paths such as "it's" cannot end the quoting.
    local function shell_quote(text)
        return "'" .. (tostring(text):gsub("'", "'\\''")) .. "'"
    end

    local command = string.format(
        "find %s -name %s -type f 2>/dev/null | wc -l",
        shell_quote(directory), shell_quote(pattern)
    )

    local handle = io.popen(command)
    if not handle then
        return 0
    end

    local output = handle:read("*a")
    handle:close()

    return tonumber(output) or 0
end
72-- }}}
73
74-- {{{ Helper: Get file modification time
-- Returns the modification time of `filepath` as reported by `stat -c '%Y'`
-- (seconds since the epoch), or 0 when the command cannot be started or prints
-- nothing numeric (for example because the file does not exist).
local function get_mtime(filepath)
    -- Quote for a POSIX shell: wrap in single quotes and rewrite each embedded
    -- single quote as '\'' so the path cannot end the quoting.
    local quoted = "'" .. (tostring(filepath):gsub("'", "'\\''")) .. "'"

    local handle = io.popen(string.format("stat -c '%%Y' %s 2>/dev/null", quoted))
    if not handle then
        return 0
    end

    local output = handle:read("*a")
    handle:close()

    return tonumber(output) or 0
end
84-- }}}
85
86-- {{{ Helper: Load JSON file
-- Reads `filepath` and decodes its contents with dkjson.
-- Returns: the decoded value on success; otherwise nil plus a message saying
-- whether the file could not be opened, could not be read, or failed to parse.
local function load_json_file(filepath)
    local file = io.open(filepath, "r")
    if not file then
        return nil, "File not found: " .. filepath
    end

    -- read() can fail even after a successful open (e.g. the path is a
    -- directory); report that instead of handing nil to the decoder.
    local content, read_err = file:read("*all")
    file:close()
    if not content then
        return nil, "Cannot read file: " .. filepath .. ": " .. tostring(read_err)
    end

    local data, pos, err = dkjson.decode(content)
    if not data then
        return nil, "JSON parse error at position " .. tostring(pos) .. ": " .. tostring(err)
    end

    return data
end
103-- }}}
104
105-- {{{ check_embeddings
-- Compares the number of poem_*.json files in the model's similarities
-- directory with the number of entries in poems.json (one file per poem).
-- `model` falls back to M.config.default_model when omitted.
-- Returns: {complete = bool, count = num, total = num, missing = num, status = string}
--   status is "COMPLETE", "INCOMPLETE", "MISSING" (no files found) or "ERROR";
--   on "ERROR" the table also carries an `error` message and zeroed counters.
function M.check_embeddings(model)
    model = model or M.config.default_model

    local report = {
        status = "UNKNOWN",
        complete = false,
        count = 0,
        total = 0,
        missing = 0,
    }

    -- The poem list defines how many files are expected.
    local poems_data, load_err = load_json_file(M.config.poems_json)
    if not poems_data then
        report.status = "ERROR"
        report.error = "Cannot read poems.json: " .. tostring(load_err)
        return report
    end

    local total = 0
    if poems_data.poems then
        total = #poems_data.poems
    end
    report.total = total

    -- One similarity file per poem stands in for "this poem has an embedding".
    local found = count_files(utils.similarities_dir(model), "poem_*.json")
    report.count = found
    report.missing = total - found
    report.complete = total > 0 and found == total

    if report.complete then
        report.status = "COMPLETE"
    elseif found == 0 then
        report.status = "MISSING"
    else
        report.status = "INCOMPLETE"
    end

    local percent = 0
    if total > 0 then
        percent = found / total * 100
    end
    log_info(string.format("Embeddings: %d/%d (%.1f%%)", found, total, percent))

    return report
end
148-- }}}
149
150-- {{{ check_similarity_matrix
-- Verifies that the model's similarities directory holds one poem_*.json file
-- for every poem listed in poems.json.
-- `model` falls back to M.config.default_model when omitted.
-- Returns: {complete = bool, count = num, total = num, missing = num, status = string}
--   status is "COMPLETE", "INCOMPLETE", "MISSING" (no files found) or "ERROR";
--   on "ERROR" the table also carries an `error` message and zeroed counters.
function M.check_similarity_matrix(model)
    model = model or M.config.default_model

    local summary = {
        complete = false,
        status = "UNKNOWN",
        total = 0,
        count = 0,
        missing = 0,
    }

    local poems_data, read_failure = load_json_file(M.config.poems_json)
    if not poems_data then
        summary.status = "ERROR"
        summary.error = "Cannot read poems.json: " .. tostring(read_failure)
        return summary
    end

    -- Expected number of files: one per poem entry (0 when the list is absent).
    local expected = 0
    if poems_data.poems then
        expected = #poems_data.poems
    end
    summary.total = expected

    local dir = utils.similarities_dir(model)
    local present = count_files(dir, "poem_*.json")
    summary.count = present
    summary.missing = expected - present

    local all_present = expected > 0 and present == expected
    summary.complete = all_present

    if all_present then
        summary.status = "COMPLETE"
    elseif present == 0 then
        summary.status = "MISSING"
    else
        summary.status = "INCOMPLETE"
    end

    local ratio = 0
    if expected > 0 then
        ratio = present / expected * 100
    end
    log_info(string.format("Similarity Matrix: %d/%d (%.1f%%)", present, expected, ratio))

    return summary
end
193-- }}}
194
195-- {{{ check_diversity_cache
-- Checks whether diversity_cache.json exists for the model and holds as many
-- top-level entries as there are poems in poems.json.
-- `model` falls back to M.config.default_model when omitted.
-- Returns: {complete = bool, count = num, total = num, status = string, exists = bool}
--   status is "COMPLETE", "INCOMPLETE", "MISSING" or "ERROR". An `error`
--   message is added when poems.json cannot be read, or when the cache file
--   parses to something other than a JSON object/array.
function M.check_diversity_cache(model)
    model = model or M.config.default_model
    local result = {
        complete = false,
        count = 0,
        total = 0,
        status = "MISSING",
        exists = false
    }

    -- Count total poems
    local poems_data, err = load_json_file(M.config.poems_json)
    if not poems_data then
        result.status = "ERROR"
        result.error = "Cannot read poems.json: " .. tostring(err)
        return result
    end

    result.total = poems_data.poems and #poems_data.poems or 0

    -- The cache lives in the on-disk embeddings directory.
    local cache_file = utils.embeddings_dir_disk(model) .. "/diversity_cache.json"
    local cache_data = load_json_file(cache_file)

    if type(cache_data) == "table" then
        result.exists = true
        -- Count every top-level entry in the cache.
        local count = 0
        for _ in pairs(cache_data) do
            count = count + 1
        end
        result.count = count
        result.complete = (count == result.total)
        result.status = result.complete and "COMPLETE" or "INCOMPLETE"
    elseif cache_data ~= nil then
        -- The file parsed, but to a scalar (string/number/true). Iterating it
        -- with pairs() would raise, so report it as present but unusable.
        result.exists = true
        result.status = "INCOMPLETE"
        result.error = "Diversity cache is not a JSON object or array: " .. cache_file
    end
    -- Otherwise the file is absent or unparsable: keep the "MISSING" defaults.

    log_info(string.format("Diversity Cache: %s (%d/%d)", result.status, result.count, result.total))

    return result
end
241-- }}}
242
243-- {{{ check_freshness
-- Compares modification times along the pipeline to detect stale artefacts:
-- poems.json vs embeddings.json, embeddings.json vs the newest similarity
-- file, and embeddings.json vs the diversity cache.
-- `model` falls back to M.config.default_model when omitted.
-- Returns: {fresh = bool, issues = {}} where each issue is
--   {type = string, message = string, fix = string}.
function M.check_freshness(model)
    model = model or M.config.default_model
    local result = {
        fresh = true,
        issues = {}
    }

    -- Records one staleness finding and marks the overall result as stale.
    local function flag(issue_type, message, fix)
        result.fresh = false
        table.insert(result.issues, {
            type = issue_type,
            message = message,
            fix = fix
        })
    end

    -- Quote for a POSIX shell: wrap in single quotes and rewrite each embedded
    -- single quote as '\'' so the path cannot end the quoting.
    local function shell_quote(text)
        return "'" .. (tostring(text):gsub("'", "'\\''")) .. "'"
    end

    local poems_mtime = get_mtime(M.config.poems_json)
    local embeddings_file = utils.embeddings_dir(model) .. "/embeddings.json"
    local embeddings_mtime = get_mtime(embeddings_file)

    -- Embeddings are stale when the poem source changed after them.
    if poems_mtime > embeddings_mtime then
        flag("STALE_EMBEDDINGS",
            "poems.json is newer than embeddings.json",
            "./generate-embeddings.sh --incremental")
    end

    -- Similarity files: take the newest mtime among poem_*.json.
    local similarities_dir = utils.similarities_dir(model)
    local handle = io.popen(string.format(
        "find %s -name 'poem_*.json' -type f -printf '%%T@\\n' 2>/dev/null | sort -n | tail -1",
        shell_quote(similarities_dir)
    ))
    if handle then
        local latest_sim = handle:read("*a")
        handle:close()
        local similarity_mtime = tonumber(latest_sim) or 0

        -- A zero mtime means no similarity files were found; that case is
        -- not treated as staleness here.
        if similarity_mtime > 0 and embeddings_mtime > similarity_mtime then
            flag("STALE_SIMILARITY",
                "embeddings.json is newer than similarity matrix",
                "lua src/similarity-engine-parallel.lua")
        end
    end

    -- Diversity cache: only compared when the cache file has an mtime.
    local cache_file = utils.embeddings_dir_disk(model) .. "/diversity_cache.json"
    local cache_mtime = get_mtime(cache_file)

    if cache_mtime > 0 and embeddings_mtime > cache_mtime then
        flag("STALE_DIVERSITY",
            "embeddings.json is newer than diversity cache",
            "./scripts/precompute-diversity-sequences")
    end

    log_info(string.format("Freshness: %s (%d issues)", result.fresh and "FRESH" or "STALE", #result.issues))

    return result
end
305-- }}}
306
307-- {{{ validate_for_html_generation
-- Runs every pipeline check needed before HTML generation and collects the
-- findings.
-- Parameters:
--   model                    model to validate; defaults to M.config.default_model
--   require_diversity_cache  when nil or true an incomplete diversity cache is
--                            an error; when false it is only a warning
-- Returns: {ready = bool, errors = {}, warnings = {}} where each entry is
--   {type = string, message = string, fix = string}. Freshness issues are
--   always reported as warnings and never clear `ready`.
function M.validate_for_html_generation(model, require_diversity_cache)
    model = model or M.config.default_model
    if require_diversity_cache == nil then
        require_diversity_cache = true
    end

    local outcome = {
        ready = true,
        errors = {},
        warnings = {}
    }

    -- Appends a blocking error and marks the pipeline as not ready.
    local function add_error(kind, message, fix)
        outcome.ready = false
        table.insert(outcome.errors, { type = kind, message = message, fix = fix })
    end

    local embeddings = M.check_embeddings(model)
    if not embeddings.complete then
        add_error(
            "INCOMPLETE_EMBEDDINGS",
            string.format("Embeddings incomplete: %d/%d poems", embeddings.count, embeddings.total),
            "./generate-embeddings.sh --incremental"
        )
    end

    local similarity = M.check_similarity_matrix(model)
    if not similarity.complete then
        add_error(
            "INCOMPLETE_SIMILARITY",
            string.format("Similarity matrix incomplete: %d/%d poems", similarity.count, similarity.total),
            "lua src/similarity-engine-parallel.lua"
        )
    end

    -- The diversity cache is blocking only when the caller requires it.
    local diversity = M.check_diversity_cache(model)
    if not diversity.complete then
        if require_diversity_cache then
            add_error(
                "MISSING_DIVERSITY",
                "Diversity cache not generated",
                "./scripts/precompute-diversity-sequences"
            )
        else
            table.insert(outcome.warnings, {
                type = "MISSING_DIVERSITY",
                message = "Diversity cache not generated (generation will be slow)",
                fix = "./scripts/precompute-diversity-sequences"
            })
        end
    end

    -- Staleness findings are forwarded unchanged as warnings.
    local freshness = M.check_freshness(model)
    if not freshness.fresh then
        for i = 1, #freshness.issues do
            table.insert(outcome.warnings, freshness.issues[i])
        end
    end

    return outcome
end
371-- }}}
372
373-- {{{ print_validation_report
-- Writes a human-readable report for a validation result to stderr: the
-- errors (if any), then the warnings (if any), then a final ready/not-ready
-- line. `validation` is a table with `errors`, `warnings` and `ready` fields,
-- as returned by M.validate_for_html_generation.
function M.print_validation_report(validation)
    local out = io.stderr

    -- Prints each entry's message and suggested fix, then a blank line.
    local function write_entries(entries)
        for _, entry in ipairs(entries) do
            out:write(string.format(" • %s\n", entry.message))
            out:write(string.format(" Fix: %s\n", entry.fix))
        end
        out:write("\n")
    end

    if #validation.errors > 0 then
        log_error("Pipeline validation failed:")
        write_entries(validation.errors)
    end

    if #validation.warnings > 0 then
        log_warning("Pipeline warnings:")
        write_entries(validation.warnings)
    end

    local verdict
    if validation.ready then
        verdict = "✅ Pipeline ready for HTML generation\n\n"
    else
        verdict = "❌ Pipeline NOT ready - fix errors above\n\n"
    end
    out:write(verdict)
end
400-- }}}
401
402-- {{{ set_verbose
-- Turns informational logging on or off by storing `verbose` in
-- M.config.verbose, which log_info consults before writing.
function M.set_verbose(verbose)
    local settings = M.config
    settings.verbose = verbose
end
407-- }}}
408
409return M
410-- }}}
411