src/image-manager.lua
1#!/usr/bin/env lua
2
3-- Image discovery and cataloging system
4-- Scans configured directories for supported image formats and creates metadata catalog
5
6-- {{{ setup_dir_path
-- Resolve the project root directory.
-- A usable `provided_dir` (present and not starting with "-") is returned
-- unchanged; otherwise the built-in default location is used.
local function setup_dir_path(provided_dir)
    local default_root = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
    if not provided_dir then
        return default_root
    end
    -- A leading dash means we were handed a CLI flag, not a directory.
    local looks_like_flag = provided_dir:sub(1, 1) == "-"
    if looks_like_flag then
        return default_root
    end
    return provided_dir
end
14-- }}}
15
16-- {{{ parse_dir_from_args
-- Extract the positional project directory from a CLI argument vector.
--
-- Flags (anything starting with "-") are skipped. The space-separated seed
-- form "--seed N" is also recognised: when "--seed" is followed by a numeric
-- value, that value belongs to the flag and is skipped too, so it is never
-- mistaken for the directory. A non-numeric token after "--seed" is still
-- treated as a positional argument.
--
-- @param args  sequence of argument strings, or nil
-- @return the first positional argument, or nil when there is none
local function parse_dir_from_args(args)
    if not args then return nil end
    local i, n = 1, #args
    while i <= n do
        local a = args[i]
        local following = args[i + 1]
        if a == "--seed" and following ~= nil and tonumber(following) then
            -- "--seed N": consume the flag together with its numeric value.
            i = i + 2
        else
            if a and a:sub(1, 1) ~= "-" then
                return a
            end
            i = i + 1
        end
    end
    return nil
end
29-- }}}
30
31-- {{{ parse_seed_from_args
-- Issue 10-058: read the build's master seed from the shared argument vector.
-- Two spellings are understood: "--seed=N" (the form run.sh passes, so the bare
-- number cannot be confused with the positional DIR) and "--seed N".
-- The first seed flag encountered decides the result: its value is passed
-- through tonumber(), so a non-numeric or missing value yields nil.
-- Returns a number, or nil when no usable master seed was supplied (per-source
-- random_seed still applies in that case).
local function parse_seed_from_args(args)
    if not args then return nil end

    local count = #args
    local pos = 1
    while pos <= count do
        local token = args[pos]
        if token == "--seed" then
            -- Space-separated form: the value is the next argument.
            return tonumber(args[pos + 1])
        end
        local inline_value = token:match("^--seed=(.+)$")
        if inline_value then
            return tonumber(inline_value)
        end
        pos = pos + 1
    end

    return nil
end
49-- }}}
50
-- Script configuration: the project root is the first positional CLI argument
-- (or the built-in default when none is given).
local DIR = setup_dir_path(parse_dir_from_args(arg))
-- Issue 10-058: build-wide master seed; stays nil unless --seed was passed.
local MASTER_SEED = parse_seed_from_args(arg)

-- Put the project's bundled libraries on the search path before requiring them.
package.path = string.format("%s/libs/?.lua;%s", DIR, package.path)
local dkjson = require("dkjson")
local utils = require("utils")

-- Issue 10-003: unified configuration (config.lua) via config-loader.
local config_loader = require("config-loader")
config_loader.set_project_root(DIR)
local unified_config = config_loader.load()

-- Issue 10-015a: sources configuration, used for multi-directory support.
local sources_loader = require("sources-loader")
sources_loader.set_project_root(DIR)

-- Asset path configuration. The raw argument vector is handed to utils; per the
-- original note a CLI --dir override takes precedence over the config value.
utils.init_assets_root(arg)

-- ANSI escape sequences for terminal output.
local COLOR_YELLOW = "\027[93m" -- bright yellow, intended for warnings
local COLOR_RESET = "\027[0m"
76
77-- {{{ local function shell_escape
-- Escape single quotes so a path can be embedded inside a single-quoted shell
-- word, e.g. "Sant'Azraphel.png" -> "Sant'\''Azraphel.png".
--
-- Returns exactly one value (the escaped string). string.gsub also yields the
-- number of substitutions; that count is deliberately dropped so it cannot leak
-- into a caller's argument list when this call is the last argument.
--
-- @param path  string to escape
-- @return the escaped string
local function shell_escape(path)
    local escaped = path:gsub("'", "'\\''")
    return escaped
end
83-- }}}
84
85-- {{{ local function relative_path
-- Render a path relative to the project root (DIR) for display purposes.
--
-- Paths inside DIR come back as "./<rest>"; DIR itself becomes "./". The prefix
-- is only accepted at a path-component boundary, so a sibling such as
-- "<DIR>-backup/x" is NOT treated as being inside DIR. Anything outside DIR is
-- returned unchanged.
--
-- @param absolute_path  path string to shorten
-- @return the display path
local function relative_path(absolute_path)
    local root_len = #DIR
    if absolute_path:sub(1, root_len) ~= DIR then
        return absolute_path
    end

    local rest = absolute_path:sub(root_len + 1)
    if rest:sub(1, 1) == "/" then
        -- Normal case: "<DIR>/<rest>".
        return "./" .. rest:sub(2)
    end
    if rest == "" or DIR:sub(-1) == "/" then
        -- Either the path is DIR itself, or DIR already ends with the separator.
        return "./" .. rest
    end

    -- Textual prefix only (e.g. "/proj" vs "/project-two"): not inside DIR.
    return absolute_path
end
94-- }}}
95
96local M = {}
97
98-- {{{ function load_config
-- Build the image-discovery configuration.
--
-- Issue 10-015a: image directories come from sources-loader ("images" source).
-- Issue 10-030: each directory is returned as a full object (name, path,
-- description, randomize_order, random_seed), not just a path.
-- There is deliberately no fallback for the directory list: a missing or empty
-- sources.images configuration raises an error.
--
-- Directory paths located under the project root (DIR) are converted to
-- root-relative form. The root is only stripped at a path-component boundary;
-- a directory outside DIR keeps its path exactly as sources-loader reported it.
--
-- @return table with enabled, image_directories, supported_formats,
--         max_file_size_mb, output_path and catalog_file
local function load_config()
    local images_source = sources_loader.get_source("images")
    if not images_source then
        error("sources.images not configured in config.lua - required for image discovery")
    end

    local directories = sources_loader.get_directories("images")
    if #directories == 0 then
        error("sources.images.directories is empty in config.lua - no directories to scan")
    end

    -- Compare against the root without trailing slashes so "/proj" and "/proj/"
    -- behave the same.
    local root = (DIR:gsub("/+$", ""))
    local root_prefix = root .. "/"

    local image_directories = {}
    for _, dir in ipairs(directories) do
        local path = dir.path
        if path == root or path == DIR then
            -- The root itself: relative path is empty.
            path = ""
        elseif path:sub(1, #root_prefix) == root_prefix then
            path = path:sub(#root_prefix + 1)
        end
        table.insert(image_directories, {
            name = dir.name,
            path = path,
            description = dir.description,
            -- Issue 10-030: randomization options
            randomize_order = dir.randomize_order or false,
            random_seed = dir.random_seed
        })
    end

    -- Format/size limits prefer sources.images and fall back to the legacy
    -- image_integration section, then to built-in defaults. Display settings
    -- come from image_integration only.
    local display_config = unified_config.image_integration or {}
    return {
        enabled = sources_loader.is_enabled("images"),
        image_directories = image_directories,
        supported_formats = images_source.supported_formats
            or display_config.supported_formats
            or {"png", "jpg", "jpeg", "gif", "webp", "svg"},
        max_file_size_mb = images_source.max_file_size_mb
            or display_config.max_file_size_mb
            or 10,
        output_path = display_config.output_path,
        catalog_file = display_config.catalog_file
    }
end
149-- }}}
150
151-- {{{ function get_file_size
-- Return the size of a file in bytes, as reported by `stat -c %s`.
--
-- @param file_path  path of the file to inspect
-- @return size in bytes; 0 when the command cannot be started or prints
--         nothing numeric (stat's error output is discarded)
local function get_file_size(file_path)
    local stat_cmd = string.format("stat -c %%s '%s' 2>/dev/null", (shell_escape(file_path)))
    local handle = io.popen(stat_cmd)
    if not handle then
        -- io.popen returns nil when the process cannot be spawned.
        return 0
    end
    local result = handle:read("*a")
    handle:close()

    if result and result ~= "" then
        local clean_result = result:gsub("%s+", "")
        local size = tonumber(clean_result)
        if size then
            return size
        end
    end

    return 0
end
168-- }}}
169
170-- {{{ function get_file_mtime
-- Return a file's modification time as a Unix timestamp (`stat -c %Y`).
--
-- @param file_path  path of the file to inspect
-- @return modification time in seconds since the epoch; the current time
--         (os.time()) when the command cannot be started or prints nothing
--         numeric
local function get_file_mtime(file_path)
    local stat_cmd = string.format("stat -c %%Y '%s' 2>/dev/null", (shell_escape(file_path)))
    local handle = io.popen(stat_cmd)
    if not handle then
        -- io.popen returns nil when the process cannot be spawned.
        return os.time()
    end
    local result = handle:read("*a")
    handle:close()

    if result and result ~= "" then
        local clean_result = result:gsub("%s+", "")
        local timestamp = tonumber(clean_result)
        if timestamp then
            return timestamp
        end
    end

    return os.time()
end
187-- }}}
188
189-- {{{ function extract_image_dimensions
-- Read an image's pixel dimensions with ImageMagick's `identify`.
--
-- The format string ends in "\n" so that every frame/layer of a multi-frame
-- image (e.g. an animated GIF) is reported on its own line. Without the
-- separator the per-frame outputs run together ("320x240320x240") and the
-- height would be parsed as 240320. Only the first reported frame is used.
--
-- @param file_path  path of the image
-- @return width, height as numbers, or nil, nil when identify is unavailable
--         or produces no parsable output
local function extract_image_dimensions(file_path)
    local identify_cmd = string.format(
        "identify -format '%%wx%%h\\n' '%s' 2>/dev/null", (shell_escape(file_path)))
    local handle = io.popen(identify_cmd)
    if not handle then
        -- io.popen returns nil when the process cannot be spawned.
        return nil, nil
    end
    local result = handle:read("*a")
    handle:close()

    if result and result ~= "" then
        -- %d+ cannot cross the newline, so this captures the first frame only.
        local width, height = result:match("(%d+)x(%d+)")
        if width and height then
            return tonumber(width), tonumber(height)
        end
    end

    -- Unknown dimensions.
    return nil, nil
end
207-- }}}
208
209-- {{{ function generate_image_hash
-- Compute the MD5 digest of a file (via `md5sum`) for duplicate detection.
--
-- @param file_path  path of the file to hash
-- @return the digest string with all whitespace removed, or nil when the
--         command cannot be started or yields no digest. Exactly one value is
--         returned, and never an empty string, so unrelated unreadable files
--         cannot end up sharing the hash "".
local function generate_image_hash(file_path)
    local hash_cmd = string.format(
        "md5sum '%s' 2>/dev/null | cut -d' ' -f1", (shell_escape(file_path)))
    local handle = io.popen(hash_cmd)
    if not handle then
        -- io.popen returns nil when the process cannot be spawned.
        return nil
    end
    local result = handle:read("*a")
    handle:close()

    -- Assigning to a single local drops gsub's substitution count.
    local digest = (result or ""):gsub("%s+", "")
    if digest == "" then
        return nil
    end
    return digest
end
223-- }}}
224
225-- {{{ function is_supported_format
-- Tell whether a path's extension is one of the supported formats.
-- The extension is the text after the last "." in the whole path; the
-- comparison ignores case on both sides. Paths without a "." never match.
local function is_supported_format(file_path, supported_formats)
    local ext = file_path:match("%.([^%.]+)$")
    if ext == nil then
        return false
    end

    local wanted = ext:lower()
    local found = false
    for _, candidate in ipairs(supported_formats) do
        if candidate:lower() == wanted then
            found = true
            break
        end
    end
    return found
end
241-- }}}
242
243-- {{{ function scan_directory_for_images
-- Build the catalog entry for one image file.
-- `dir_config` (Issue 10-030) is the directory object the file was found under;
-- its name and randomization settings are copied onto the entry.
local function build_image_entry(file_path, file_size, directory, dir_config)
    local width, height = extract_image_dimensions(file_path)
    local file_hash = generate_image_hash(file_path)
    local mtime = get_file_mtime(file_path)

    -- Strip the project root with a plain prefix comparison. DIR must not be
    -- spliced into a Lua pattern: characters such as "-" or "." in the path are
    -- pattern magic and would stop the prefix from matching.
    local root_prefix = DIR .. "/"
    local rel_path = file_path
    if file_path:sub(1, #root_prefix) == root_prefix then
        rel_path = file_path:sub(#root_prefix + 1)
    end

    return {
        file_path = file_path,
        relative_path = rel_path,
        filename = file_path:match("([^/]+)$"),
        extension = file_path:match("%.([^%.]+)$"):lower(),
        size_bytes = file_size,
        size_mb = math.floor(file_size / 1024 / 1024 * 100) / 100,
        width = width,
        height = height,
        aspect_ratio = (width and height) and (width / height) or nil,
        hash = file_hash,
        modification_time = mtime,
        -- NOTE(review): os.date without a "!" prefix formats local time, yet the
        -- string ends in "Z" -- confirm whether UTC was intended.
        modification_date = os.date("%Y-%m-%dT%H:%M:%SZ", mtime),
        source_directory = directory,
        -- Issue 10-030: track source config for randomization
        source_name = dir_config and dir_config.name or nil,
        source_randomize = dir_config and dir_config.randomize_order or false,
        source_random_seed = dir_config and dir_config.random_seed or nil
    }
end

-- Scan one directory tree (recursively, via `find -type f`) for images.
--
-- Files whose extension is not in config.supported_formats are ignored; files
-- larger than config.max_file_size_mb (default 10) are counted as skipped.
--
-- @param directory   absolute path of the directory to scan
-- @param config      table providing supported_formats and max_file_size_mb
-- @param dir_config  (Issue 10-030) directory object used for source tracking; may be nil
-- @return sequence of image entries (empty when the directory is missing or
--         the file listing cannot be started)
local function scan_directory_for_images(directory, config, dir_config)
    local images = {}

    print("š Scanning directory: " .. relative_path(directory))

    local quoted_dir = (shell_escape(directory))

    -- Run the existence check once: os.execute returns 0 on Lua 5.1 and true
    -- on 5.2+ for a successful command.
    local status = os.execute(string.format("test -d '%s'", quoted_dir))
    local exists = (status == true or status == 0)

    if not exists then
        print("ā ļø Directory not found: " .. relative_path(directory))
        return images
    end

    local handle = io.popen(string.format("find '%s' -type f", quoted_dir))
    if not handle then
        -- io.popen returns nil when the process cannot be spawned.
        io.stderr:write("[image-manager] could not list files in: " .. directory .. "\n")
        return images
    end

    local max_size = (config.max_file_size_mb or 10) * 1024 * 1024
    local processed_count = 0
    local skipped_count = 0

    for file_path in handle:lines() do
        if is_supported_format(file_path, config.supported_formats) then
            local file_size = get_file_size(file_path)
            if file_size <= max_size then
                table.insert(images, build_image_entry(file_path, file_size, directory, dir_config))
                processed_count = processed_count + 1
            else
                skipped_count = skipped_count + 1
            end
        end
    end

    handle:close()

    print(string.format(" š Processed: %d images", processed_count))
    if skipped_count > 0 then
        print(string.format(" ā© Skipped: %d images (size/format)", skipped_count))
    end

    return images
end
313-- }}}
314
315-- {{{ local function create_seeded_rng
-- Issue 10-030: build a deterministic pseudo-random generator from a seed.
-- Returns a function that, on each call, advances a linear congruential
-- generator (multiplier 1103515245, increment 12345, modulus 2^31 -- the
-- constants the original comment attributes to glibc) and returns the new
-- state divided by the modulus, i.e. a value in [0, 1).
-- Note: the intermediate product can exceed 2^53, so on Lua builds where all
-- numbers are doubles its low bits are rounded before the modulo is taken.
local function create_seeded_rng(seed)
    local MULTIPLIER, INCREMENT, MODULUS = 1103515245, 12345, 2147483648
    local current = seed

    local function next_value()
        current = (current * MULTIPLIER + INCREMENT) % MODULUS
        return current / MODULUS
    end

    return next_value
end
326-- }}}
327
328-- {{{ local function derive_source_seed(master_seed, source_name)
-- Issue 10-058: derive a stable per-source seed from the build-wide master seed.
-- The source is identified by its NAME, hashed djb2-style (start at 5381, then
-- hash * 33 + byte for each byte), rather than by its position in the
-- iteration, so the result does not depend on how sources are ordered.
-- Every step is reduced modulo 2147483647, which keeps intermediate values
-- small; the returned seed is (master_seed + hash) reduced the same way.
local function derive_source_seed(master_seed, source_name)
    local MODULUS = 2147483647
    local acc = 5381
    local pos, len = 1, #source_name
    while pos <= len do
        acc = (acc * 33 + source_name:byte(pos)) % MODULUS
        pos = pos + 1
    end
    return (master_seed + acc) % MODULUS
end
343-- }}}
344
345-- {{{ local function apply_randomization
-- Issue 10-030: give images from randomized sources a pseudo-random timestamp.
--
-- The timestamps are drawn from the time span covered by the NON-randomized
-- images (or from the last 365 days when there are none). Each source gets one
-- generator, chosen on first use:
--   1. the source's explicit random_seed, if set;
--   2. otherwise a seed derived from the master seed and the source name
--      (Issue 10-058);
--   3. otherwise math.random, with a warning on stderr because the result is
--      not reproducible.
-- Entries are modified in place (modification_time, modification_date,
-- randomized = true). Returns the number of entries that were randomized.
local function apply_randomization(all_images)
    -- Pass 1: timeline bounds taken from the images that keep their real time.
    local earliest, latest = nil, nil
    for _, entry in ipairs(all_images) do
        if not entry.source_randomize then
            local stamp = entry.modification_time
            if earliest == nil or stamp < earliest then earliest = stamp end
            if latest == nil or stamp > latest then latest = stamp end
        end
    end

    if earliest == nil then
        -- Nothing to anchor on: use the year leading up to now.
        local now = os.time()
        earliest = now - 365 * 24 * 60 * 60
        latest = now
    end

    local span = latest - earliest
    if span <= 0 then span = 1 end -- keep the span strictly positive

    -- One generator per source name, created lazily.
    local generators = {}
    local function generator_for(entry, name)
        local existing = generators[name]
        if existing then
            return existing
        end

        local created
        if entry.source_random_seed then
            -- Explicit per-source seed: highest precedence.
            created = create_seeded_rng(entry.source_random_seed)
        elseif MASTER_SEED then
            -- Issue 10-058: reproducible by default via the master seed.
            created = create_seeded_rng(derive_source_seed(MASTER_SEED, name))
        else
            -- Neither seed available: system RNG, not reproducible.
            io.stderr:write(string.format(
                "[image-manager] WARNING: source '%s' randomized with the system RNG "
                .. "(no per-source random_seed and no --seed master seed) -- this build's "
                .. "image order is NOT reproducible.\n", name))
            created = math.random
        end

        generators[name] = created
        return created
    end

    -- Pass 2: rewrite the timestamps of randomized entries.
    local touched = 0
    for _, entry in ipairs(all_images) do
        if entry.source_randomize then
            local draw = generator_for(entry, entry.source_name or "default")
            local offset = draw() * span
            entry.modification_time = math.floor(earliest + offset)
            entry.modification_date = os.date("%Y-%m-%dT%H:%M:%SZ", entry.modification_time)
            entry.randomized = true -- marker for debugging
            touched = touched + 1
        end
    end

    return touched
end
414-- }}}
415
416-- {{{ function M.discover_images
-- Discover every image in the configured directories.
--
-- Loads the configuration, scans each configured directory, applies the
-- Issue 10-030 timestamp randomization and sorts the result newest-first by
-- modification_time.
--
-- Directory paths from the configuration are normally relative to the project
-- root (DIR). A path that is already absolute (a directory outside DIR) is
-- scanned as-is instead of being appended to DIR.
--
-- @return sequence of image entries; an empty table when image integration is
--         disabled in the configuration
function M.discover_images()
    print("š¼ļø Starting image discovery...")

    local config = load_config()
    if not config.enabled then
        print("ā Image integration disabled in configuration")
        return {}
    end

    local all_images = {}

    -- Issue 10-030: iterate over directory objects (not just paths)
    for _, dir_config in ipairs(config.image_directories) do
        local dir_path = dir_config.path
        local full_path
        if dir_path:sub(1, 1) == "/" then
            full_path = dir_path
        else
            full_path = DIR .. "/" .. dir_path
        end

        local directory_images = scan_directory_for_images(full_path, config, dir_config)
        for _, image in ipairs(directory_images) do
            table.insert(all_images, image)
        end
    end

    -- Issue 10-030: randomization must happen before sorting, because it
    -- rewrites the modification_time the sort is keyed on.
    local randomized_count = apply_randomization(all_images)
    if randomized_count > 0 then
        print(string.format("š² Randomized timestamps for %d images", randomized_count))
    end

    -- Newest first.
    table.sort(all_images, function(a, b)
        return a.modification_time > b.modification_time
    end)

    print(string.format("✅ Image discovery complete: %d images found", #all_images))
    return all_images
end
453-- }}}
454
455-- {{{ function M.generate_catalog
-- Group images by content hash and keep only the newest entry of each group.
-- Images without a hash cannot be compared and are always kept.
-- The kept entry is tracked by table identity, so when the same file was
-- scanned twice (same hash AND same file_path) only one entry survives.
-- Returns: filtered image list, list of resolved duplicate groups (in order of
-- first appearance), number of entries removed.
local function resolve_duplicate_images(images)
    local hash_groups = {}
    local hash_order = {} -- hashes in first-seen order, for stable reporting

    for _, image in ipairs(images) do
        local hash = image.hash
        if hash then
            local group = hash_groups[hash]
            if not group then
                group = {}
                hash_groups[hash] = group
                hash_order[#hash_order + 1] = hash
            end
            group[#group + 1] = image
        end
    end

    local kept = {} -- image entry -> true
    local resolved_duplicates = {}
    local duplicate_count = 0

    for _, hash in ipairs(hash_order) do
        local group = hash_groups[hash]
        if #group > 1 then
            -- Newest first; the head of the group is the one we keep.
            table.sort(group, function(a, b)
                return (a.modification_time or 0) > (b.modification_time or 0)
            end)

            local removed_paths = {}
            for i = 2, #group do
                table.insert(removed_paths, group[i].relative_path)
                duplicate_count = duplicate_count + 1
            end

            table.insert(resolved_duplicates, {
                hash = hash,
                kept = group[1].relative_path,
                removed = removed_paths,
                count = #group
            })
        end
        kept[group[1]] = true
    end

    local filtered_images = {}
    for _, image in ipairs(images) do
        if not image.hash or kept[image] then
            table.insert(filtered_images, image)
        end
    end

    return filtered_images, resolved_duplicates, duplicate_count
end

-- Compute catalog statistics. Counts and sizes are taken over the
-- de-duplicated list; total_images is the count before de-duplication.
local function compute_catalog_statistics(images, filtered_images, duplicate_count)
    local stats = {
        total_images = #images,
        unique_images = #filtered_images,
        duplicate_images = duplicate_count,
        total_size_mb = 0,
        average_size_mb = 0,
        format_distribution = {},
        size_distribution = {
            small = 0,  -- < 0.1 MB
            medium = 0, -- 0.1 MB up to 1 MB
            large = 0   -- 1 MB and above
        },
        resolution_distribution = {
            low = 0,    -- width < 500 px (also used when width is unknown)
            medium = 0, -- 500 px up to 1500 px
            high = 0    -- 1500 px and above
        }
    }

    for _, image in ipairs(filtered_images) do
        local size_mb = image.size_mb or 0
        stats.total_size_mb = stats.total_size_mb + size_mb

        local ext = image.extension or "unknown"
        stats.format_distribution[ext] = (stats.format_distribution[ext] or 0) + 1

        local sizes = stats.size_distribution
        if size_mb < 0.1 then
            sizes.small = sizes.small + 1
        elseif size_mb < 1 then
            sizes.medium = sizes.medium + 1
        else
            sizes.large = sizes.large + 1
        end

        local width = image.width or 0
        local resolutions = stats.resolution_distribution
        if width < 500 then
            resolutions.low = resolutions.low + 1
        elseif width < 1500 then
            resolutions.medium = resolutions.medium + 1
        else
            resolutions.high = resolutions.high + 1
        end
    end

    -- Average is computed from the unrounded total, then both are truncated
    -- (2 decimals for the total, 3 for the average).
    stats.average_size_mb = stats.total_size_mb / math.max(#filtered_images, 1)
    stats.total_size_mb = math.floor(stats.total_size_mb * 100) / 100
    stats.average_size_mb = math.floor(stats.average_size_mb * 1000) / 1000

    return stats
end

-- Build the image catalog, write it as JSON and return it.
--
-- Duplicates (same hash) are resolved by keeping the newest file of each
-- group; the catalog records what was dropped in `resolved_duplicates`.
--
-- @param images       sequence of image entries (as produced by discover_images)
-- @param output_file  optional catalog path; defaults to
--                     utils.asset_path("image-catalog.json")
-- @return the catalog table (metadata, images, resolved_duplicates)
-- Raises an error when the catalog file cannot be opened for writing.
function M.generate_catalog(images, output_file)
    local config = load_config()

    -- Make sure the assets directory exists. The path is quoted so spaces or
    -- shell metacharacters in it cannot break or alter the command.
    local assets_dir = utils.get_assets_root()
    os.execute("mkdir -p '" .. shell_escape(assets_dir) .. "'")

    local filtered_images, resolved_duplicates, duplicate_count =
        resolve_duplicate_images(images)

    if duplicate_count > 0 then
        print(string.format("š Resolved %d duplicates (kept newest of each group)", duplicate_count))
    end

    local stats = compute_catalog_statistics(images, filtered_images, duplicate_count)

    local catalog = {
        metadata = {
            generated_at = os.date("%Y-%m-%dT%H:%M:%SZ"),
            total_images = #images,           -- before de-duplication
            unique_images = #filtered_images, -- after de-duplication
            configuration = config,
            statistics = stats
        },
        images = filtered_images,
        resolved_duplicates = resolved_duplicates
    }

    local catalog_path = output_file or utils.asset_path("image-catalog.json")
    local file = io.open(catalog_path, "w")
    if not file then
        error("Could not create catalog file: " .. catalog_path)
    end

    file:write(dkjson.encode(catalog, { indent = true }))
    file:close()

    print("š Generated catalog: " .. relative_path(catalog_path))
    return catalog
end
608-- }}}
609
610-- {{{ function M.show_statistics
-- Print catalog statistics to stdout.
-- Issue 10-015a: the `verbose` flag controls the level of detail.
--   Always printed:  total / unique / duplicate counts, total and average size.
--   Verbose only:    format, size and resolution distributions.
-- Per-group duplicate details are intentionally not printed; they are recorded
-- in the catalog itself.
function M.show_statistics(catalog, verbose)
    local stats = catalog.metadata.statistics

    -- Small formatter so each report line is a single call.
    local function emit(fmt, ...)
        print(string.format(fmt, ...))
    end

    print("\n=== IMAGE CATALOG STATISTICS ===")
    emit("Total Images: %d", stats.total_images)
    emit("Unique Images: %d", stats.unique_images)
    emit("Duplicate Images: %d", stats.duplicate_images)
    emit("Total Size: %.2f MB", stats.total_size_mb)
    emit("Average Size: %.3f MB", stats.average_size_mb)

    if not verbose then
        return
    end

    print("\nFormat Distribution:")
    for format, count in pairs(stats.format_distribution) do
        emit(" %s: %d images", format, count)
    end

    local by_size = stats.size_distribution
    print("\nSize Distribution:")
    emit(" Small (<100KB): %d images", by_size.small)
    emit(" Medium (100KB-1MB): %d images", by_size.medium)
    emit(" Large (>1MB): %d images", by_size.large)

    local by_resolution = stats.resolution_distribution
    print("\nResolution Distribution:")
    emit(" Low (<500px): %d images", by_resolution.low)
    emit(" Medium (500-1500px): %d images", by_resolution.medium)
    emit(" High (>1500px): %d images", by_resolution.high)
end
648-- }}}
649
650-- {{{ function M.main
-- Run the full pipeline: discover images, write the catalog, print statistics.
-- Issue 10-015a: `verbose` enables the detailed distribution output.
--
-- @param verbose  truthy to print detailed statistics
-- @return true when a catalog was generated, false when no images were found
function M.main(verbose)
    print("š¼ļø Image Integration System")
    print("Project Directory: " .. relative_path(DIR))

    local images = M.discover_images()

    if #images == 0 then
        print("ā No images found in configured directories")
        return false
    end

    local catalog = M.generate_catalog(images)

    -- Detailed distributions are only printed when verbose is set.
    M.show_statistics(catalog, verbose)

    print("\n✅ Image integration system ready")
    return true
end
673-- }}}
674
-- Command line entry point: runs only when this file is executed directly
-- (script name ends in "image-manager.lua"), not when it is required as a module.
if arg and arg[0] and arg[0]:match("image%-manager%.lua$") then
    -- Issue 10-015a: --verbose / -v turns on detailed statistics.
    local verbose = false
    local idx = 1
    while idx <= #arg and not verbose do
        local flag = arg[idx]
        verbose = (flag == "--verbose" or flag == "-v")
        idx = idx + 1
    end
    M.main(verbose)
end
687
688return M