src/poem-validator.lua
1#!/usr/bin/env lua
2
3-- {{{ local function setup_dir_path
-- Resolve the project root directory.
-- provided_dir: optional override; any truthy value is returned unchanged.
-- Returns the override when given, otherwise the built-in default location.
local function setup_dir_path(provided_dir)
    local default_root = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
    return provided_dir or default_root
end
10-- }}}
11
-- Script configuration: project root used for module lookup and path display
local DIR = setup_dir_path()

-- Load required libraries: the project's libs/ and src/ directories are
-- searched before the pre-existing package.path entries
package.path = table.concat({
    DIR .. "/libs/?.lua",
    DIR .. "/src/?.lua",
    package.path,
}, ";")
local dkjson = require("dkjson")
local utils = require("utils")

-- Initialize asset path configuration for standalone execution
utils.init_assets_root(arg)

-- ANSI color codes for terminal output
local COLOR_GOLD = "\027[93m"  -- Bright yellow (gold) for golden poems
local COLOR_RESET = "\027[0m"  -- Restores the terminal's default attributes
26
27-- {{{ local function relative_path
-- Shorten a path for display by rewriting paths inside DIR as "./<rest>".
-- absolute_path: string path to shorten.
-- Returns "./<rest>" when the path is DIR itself or lies below it; any other
-- path is returned unchanged.
local function relative_path(absolute_path)
    local root_len = #DIR
    if absolute_path:sub(1, root_len) ~= DIR then
        return absolute_path
    end

    local rest = absolute_path:sub(root_len + 1)

    -- A bare string-prefix match is not enough: "<DIR>-old/x" starts with DIR
    -- but is a sibling directory. Require the match to end on a path boundary
    -- (end of string, a "/" right after the prefix, or DIR itself ending in "/").
    local on_boundary = rest == ""
        or rest:sub(1, 1) == "/"
        or DIR:sub(-1) == "/"
    if not on_boundary then
        return absolute_path
    end

    if rest:sub(1, 1) == "/" then
        rest = rest:sub(2)
    end
    return "./" .. rest
end
36-- }}}
37
38local M = {}
39
40-- {{{ local function generate_character_distribution_report
-- Build the human-readable character-count distribution report.
-- validation_stats: table with
--   character_distribution: optional map of length (as string) -> poem count
--   by_category: optional map of category -> { total = n, golden = n }
-- Returns the report as one newline-joined string.
-- Output order is deterministic: lengths are sorted by count (descending) with
-- ties broken by length (ascending), and categories are listed by name.
local function generate_character_distribution_report(validation_stats)
    local TOP_N = 20            -- number of distribution rows displayed
    local GOLDEN_LENGTH = 1024  -- length that is highlighted as "golden"

    local report = {
        "Character Count Distribution (Top 20 by Frequency):",
        "====================================================",
        ""
    }

    -- Collect all lengths with their counts. Keys that are not numeric cannot
    -- be formatted with %d, so they are left out instead of aborting the report.
    local lengths = {}
    for length_str, count in pairs(validation_stats.character_distribution or {}) do
        local length = tonumber(length_str)
        if length then
            lengths[#lengths + 1] = { length = length, count = count }
        end
    end

    -- Sort by number of occurrences (descending), not by character length.
    -- pairs() order is unspecified and table.sort is not stable, so equal
    -- counts need an explicit tie-break to produce the same report every run.
    table.sort(lengths, function(a, b)
        if a.count ~= b.count then
            return a.count > b.count
        end
        return a.length < b.length
    end)

    -- Show the most frequent results only
    for i = 1, math.min(TOP_N, #lengths) do
        local item = lengths[i]
        local line
        if item.length == GOLDEN_LENGTH then
            -- Golden poems get gold coloring
            line = COLOR_GOLD .. string.format("%d poems @ %d characters ← GOLDEN POEMS",
                item.count, item.length) .. COLOR_RESET
        else
            line = string.format("%d poems @ %d characters", item.count, item.length)
        end
        report[#report + 1] = line
    end

    -- Show how many more entries exist
    if #lengths > TOP_N then
        report[#report + 1] = string.format("... and %d more unique character counts", #lengths - TOP_N)
    end

    -- Category breakdown if available
    local by_category = validation_stats.by_category
    if by_category and next(by_category) then
        report[#report + 1] = ""
        report[#report + 1] = "By Category:"
        report[#report + 1] = "------------"

        -- Iterate categories in name order rather than pairs() order
        local names = {}
        for cat in pairs(by_category) do
            names[#names + 1] = cat
        end
        table.sort(names, function(a, b) return tostring(a) < tostring(b) end)

        for _, cat in ipairs(names) do
            local cat_stats = by_category[cat]
            local golden_info = ""
            -- A missing golden counter is treated as zero
            if (cat_stats.golden or 0) > 0 then
                golden_info = string.format(" (%d golden)", cat_stats.golden)
            end
            report[#report + 1] = string.format(" %s: %d poems%s", cat, cat_stats.total, golden_info)
        end
    end

    return table.concat(report, "\n")
end
93-- }}}
94
95-- {{{ local function analyze_poem_content
-- Analyze a single poem record.
-- poem: table with id, filename, category, content, length and an optional
--       metadata table (is_golden_poem, golden_poem_character_count).
-- Returns an analysis table:
--   id, filename, category, length  copied from the poem
--   has_content                     always a boolean (false when content is missing/empty)
--   actual_length                   length of content in bytes (0 when missing)
--   length_matches                  whether poem.length equals actual_length
--   line_count, word_count          0 for empty poems
--   char_distribution               map of character -> occurrences
--   is_fediverse_length             true only for non-empty content of at most 1024 bytes
--   golden_poem_character_count,
--   is_golden_poem                  taken from the poem's metadata
local function analyze_poem_content(poem)
    local content = poem.content
    local byte_length = content and #content or 0
    local metadata = poem.metadata

    local analysis = {
        id = poem.id,
        filename = poem.filename,
        category = poem.category,
        -- Computed from the length so the field is a real boolean; a nil here
        -- would be dropped when the analysis is serialized to JSON.
        has_content = byte_length > 0,
        length = poem.length,
        actual_length = byte_length,
        length_matches = poem.length == byte_length,
        line_count = 0,
        word_count = 0,
        char_distribution = {},
        is_fediverse_length = false, -- Exactly 1024 chars or less
        -- Use pre-calculated golden poem status from extraction metadata
        golden_poem_character_count = metadata and metadata.golden_poem_character_count or nil,
        is_golden_poem = metadata and metadata.is_golden_poem or false
    }

    if byte_length > 0 then
        -- Count lines: number of newlines plus one
        local _, newline_count = content:gsub("\n", "\n")
        analysis.line_count = newline_count + 1

        -- Count words (runs of non-whitespace); gsub's second result is the
        -- number of matches, so no explicit loop is needed
        local _, word_count = content:gsub("%S+", "")
        analysis.word_count = word_count

        -- Character distribution analysis. Each match is one byte plus any
        -- UTF-8 continuation bytes (0x80-0xBF) that follow it, so a multi-byte
        -- character is counted once under its full encoding instead of being
        -- split into single bytes that are not valid text on their own.
        local distribution = analysis.char_distribution
        for char in content:gmatch(".[\128-\191]*") do
            distribution[char] = (distribution[char] or 0) + 1
        end

        -- Check if it's fediverse-compatible length (1024 chars including content warning)
        -- NOTE: this compares the byte length, as actual_length is measured with '#'.
        analysis.is_fediverse_length = byte_length <= 1024
    end

    return analysis
end
134-- }}}
135
136-- {{{ local function detect_duplicates
-- Find poems whose content repeats that of an earlier poem.
-- Content is compared after collapsing every whitespace run to one space and
-- lower-casing; poems whose content is 10 bytes or shorter are not compared.
-- poems: array of poem tables (id, content).
-- Returns an array of { original = <id of first occurrence>,
--                       duplicate = <id of the repeat>,
--                       content_preview = <first 50 bytes of the repeat .. "..."> }.
local function detect_duplicates(poems)
    local first_seen = {}  -- normalized content -> id of the first poem carrying it
    local found = {}

    -- Normalize whitespace and case so trivially reformatted copies compare equal
    local function fingerprint(text)
        local collapsed = string.gsub(text, "%s+", " ")
        return string.lower(collapsed)
    end

    for _, entry in ipairs(poems) do
        local body = entry.content
        -- Only check non-trivial content
        if body and #body > 10 then
            local key = fingerprint(body)
            local earlier_id = first_seen[key]

            if earlier_id then
                found[#found + 1] = {
                    original = earlier_id,
                    duplicate = entry.id,
                    content_preview = string.sub(body, 1, 50) .. "..."
                }
            else
                first_seen[key] = entry.id
            end
        end
    end

    return found
end
159-- }}}
160
161
162-- {{{ local function generate_statistics
-- Aggregate per-poem analyses into corpus-wide statistics.
-- analyses: array of analysis tables (category, has_content, word_count,
--           actual_length, is_fediverse_length, is_golden_poem,
--           golden_poem_character_count, length_matches).
-- Returns a stats table with totals, min/max/average/median length of the
-- non-empty poems, a histogram of length buckets, a per-length distribution
-- and per-category totals.
local function generate_statistics(analyses)
    local stats = {
        total_poems = #analyses,
        empty_poems = 0,
        non_empty_poems = 0,
        total_words = 0,
        total_characters = 0,
        fediverse_compatible = 0,
        golden_poems = 0,             -- counted from the analyses' is_golden_poem flag
        character_distribution = {},  -- tostring(length) -> number of poems
        length_mismatches = 0,
        average_length = 0,
        median_length = 0,
        max_length = 0,
        min_length = math.huge,
        length_distribution = {
            ["0"] = 0,          -- Empty
            ["1-100"] = 0,      -- Very short
            ["101-500"] = 0,    -- Short
            ["501-1024"] = 0,   -- Fediverse length
            ["1025-2000"] = 0,  -- Medium
            ["2000+"] = 0       -- Long
        },
        by_category = {}        -- category -> { total, golden }
    }

    -- Add one to counter[key], starting from zero for unseen keys
    local function bump(counter, key)
        counter[key] = (counter[key] or 0) + 1
    end

    -- Name of the histogram bucket a length falls into
    local function bucket_for(len)
        if len == 0 then return "0" end
        if len <= 100 then return "1-100" end
        if len <= 500 then return "101-500" end
        if len <= 1024 then return "501-1024" end
        if len <= 2000 then return "1025-2000" end
        return "2000+"
    end

    local content_lengths = {}  -- lengths of the non-empty poems, for the median

    for _, entry in ipairs(analyses) do
        local size = entry.actual_length

        -- Per-category totals
        local cat = entry.category or "unknown"
        local cat_totals = stats.by_category[cat]
        if not cat_totals then
            cat_totals = { total = 0, golden = 0 }
            stats.by_category[cat] = cat_totals
        end
        cat_totals.total = cat_totals.total + 1

        if not entry.has_content then
            stats.empty_poems = stats.empty_poems + 1
        else
            stats.non_empty_poems = stats.non_empty_poems + 1
            stats.total_words = stats.total_words + entry.word_count
            stats.total_characters = stats.total_characters + size
            stats.max_length = math.max(stats.max_length, size)
            stats.min_length = math.min(stats.min_length, size)
            content_lengths[#content_lengths + 1] = size
        end

        if entry.is_fediverse_length then
            stats.fediverse_compatible = stats.fediverse_compatible + 1
        end

        if entry.is_golden_poem then
            stats.golden_poems = stats.golden_poems + 1
            cat_totals.golden = cat_totals.golden + 1
        end

        -- The golden-poem character count takes precedence over the measured length
        bump(stats.character_distribution, tostring(entry.golden_poem_character_count or size))

        if not entry.length_matches then
            stats.length_mismatches = stats.length_mismatches + 1
        end

        bump(stats.length_distribution, bucket_for(size))
    end

    -- Average and median are computed over the non-empty poems only
    if stats.non_empty_poems > 0 then
        stats.average_length = stats.total_characters / stats.non_empty_poems

        table.sort(content_lengths)
        local n = #content_lengths
        local upper = math.floor(n / 2) + 1
        if n % 2 == 1 then
            stats.median_length = content_lengths[upper]
        else
            stats.median_length = (content_lengths[upper - 1] + content_lengths[upper]) / 2
        end
    end

    -- No non-empty poem was seen: report 0 instead of infinity
    if stats.min_length == math.huge then
        stats.min_length = 0
    end

    return stats
end
272-- }}}
273
274-- {{{ function M.validate_poems
-- Validate a poems JSON file and write a JSON validation report.
-- poems_file:  path of the input file; its top-level JSON object must contain
--              a "poems" array.
-- output_file: path the report is written to (the file is overwritten).
-- Returns the report table (metadata, summary, statistics, duplicates,
-- detailed_analyses) after printing a summary to standard output.
-- Raises an error when an argument is not a string, when the input cannot be
-- opened or decoded, or when the report cannot be created or written.
function M.validate_poems(poems_file, output_file)
    -- Fail early with a clear message instead of crashing inside a string
    -- operation further down (or after all the analysis work is done)
    if type(poems_file) ~= "string" then
        error("poems_file must be a string path", 2)
    end
    if type(output_file) ~= "string" then
        error("output_file must be a string path", 2)
    end

    print("Loading poems from: " .. relative_path(poems_file))

    -- Load poems data
    local file, open_err = io.open(poems_file, "r")
    if not file then
        error("Could not open poems file: " .. poems_file .. " (" .. tostring(open_err) .. ")")
    end

    local content = file:read("*all")
    file:close()

    -- On failure dkjson.decode returns nil, a position and a message; keep the
    -- message so a malformed file can be diagnosed.
    local data, _, decode_err = dkjson.decode(content)
    -- The type checks also reject valid JSON whose top level is not an object
    -- (e.g. a bare number), which would otherwise fail when indexed.
    if type(data) ~= "table" or type(data.poems) ~= "table" then
        local reason = decode_err and (": " .. tostring(decode_err)) or ""
        error("Invalid poems file format" .. reason)
    end

    local poems = data.poems
    print("Validating " .. #poems .. " poems...")

    -- Analyze each poem
    local analyses = {}
    for _, poem in ipairs(poems) do
        analyses[#analyses + 1] = analyze_poem_content(poem)
    end

    -- Detect duplicates
    print("Checking for duplicate content...")
    local duplicates = detect_duplicates(poems)

    -- Generate statistics
    print("Generating statistics...")
    local statistics = generate_statistics(analyses)

    -- Create validation report
    local report = {
        metadata = {
            source_file = poems_file,
            validated_at = os.date("%Y-%m-%d %H:%M:%S"),
            validation_version = "2.0"
        },
        summary = {
            total_poems = #poems,
            source_metadata = data.metadata
        },
        statistics = statistics,
        duplicates = duplicates,
        detailed_analyses = analyses
    }

    -- Save report
    local json_output = dkjson.encode(report, { indent = true })
    local output, create_err = io.open(output_file, "w")
    if not output then
        error("Could not create output file: " .. output_file .. " (" .. tostring(create_err) .. ")")
    end

    -- write() reports failures (e.g. a full disk) through its return values
    local written, write_err = output:write(json_output)
    output:close()
    if not written then
        error("Could not write output file: " .. output_file .. " (" .. tostring(write_err) .. ")")
    end

    -- Print summary
    print("\n=== VALIDATION SUMMARY ===")
    print("Total Poems: " .. statistics.total_poems)
    print("Non-empty Poems: " .. statistics.non_empty_poems)
    print("Empty Poems: " .. statistics.empty_poems)
    print("Average Length: " .. string.format("%.1f", statistics.average_length) .. " characters")
    print("Median Length: " .. string.format("%.1f", statistics.median_length) .. " characters")
    print("Fediverse Compatible (≤1024 chars): " .. statistics.fediverse_compatible)
    print("Golden Poems (exactly 1024 chars): " .. statistics.golden_poems)
    print("Duplicate Content: " .. #duplicates .. " pairs")

    -- Add character distribution report
    print("\n" .. generate_character_distribution_report(statistics))

    print("\nValidation report saved to: " .. relative_path(output_file))
    return report
end
352-- }}}
353
354-- {{{ function M.main
-- Entry point of the validation tool.
-- interactive_mode: when truthy, show a menu and read the choice (and, for
--                   option 2, the input/output paths) from standard input;
--                   otherwise validate the default asset files.
-- Returns nothing. Prints a message and returns without validating when the
-- menu choice is invalid or a custom path is missing.
function M.main(interactive_mode)
    -- Default locations, resolved lazily so they are only looked up when used
    local function default_paths()
        return utils.asset_path("poems.json"), utils.asset_path("validation-report.json")
    end

    local input_file, output_file

    if interactive_mode then
        print("=== Poem Validation Tool ===")
        print("1. Validate extracted poems.json")
        print("2. Validate custom file")
        io.write("Select option (1-2): ")
        local choice = io.read()

        if choice == "1" then
            input_file, output_file = default_paths()
        elseif choice == "2" then
            io.write("Enter input file path: ")
            input_file = io.read()
            io.write("Enter output file path: ")
            output_file = io.read()

            -- io.read() returns nil at end of input and "" for an empty line;
            -- neither is a usable path, so stop here with a clear message
            if not input_file or input_file == "" then
                print("Invalid input file path")
                return
            end
            if not output_file or output_file == "" then
                print("Invalid output file path")
                return
            end
        else
            print("Invalid choice")
            return
        end
    else
        -- Default non-interactive mode
        input_file, output_file = default_paths()
    end

    M.validate_poems(input_file, output_file)
end
386-- }}}
387
-- Command line execution: runs only when the invoked script name ends in
-- "poem-validator.lua", not when the file is loaded as a module via require()
if arg and arg[0] and arg[0]:match("poem%-validator%.lua$") then
    -- Interactive mode is requested by passing -I anywhere on the command line
    local wants_interactive = false
    local position = 1
    while arg[position] ~= nil and not wants_interactive do
        wants_interactive = (arg[position] == "-I")
        position = position + 1
    end

    M.main(wants_interactive)
end
400
401return M