scripts/migrate-to-triangular-files.lua
1#!/usr/bin/env luajit
2
3-- Migration Script: Convert Full Individual Files to Triangular Format
4-- Reads existing poem_*.json files and removes redundant lower-triangle entries
5-- Each file will only contain similarities where other_id > this_id
6
7local DIR = DIR or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
8package.path = DIR .. '/libs/?.lua;' .. package.path
9
10local utils = require('utils')
11local dkjson = require('dkjson')
12
--- Quote a string for safe use as a single argument in a POSIX shell command.
-- Embedded single quotes are closed, escaped and reopened ('\'').
-- @param value the value to quote (converted with tostring)
-- @return the single-quoted string
local function shell_quote(value)
    local escaped = tostring(value):gsub("'", "'\\''")
    return "'" .. escaped .. "'"
end

--- Percentage of entries removed going from `before` to `after`.
-- Returns 0 when `before` is not positive so callers never print NaN.
-- @param before entry count before filtering
-- @param after entry count after filtering
-- @return reduction in percent
local function reduction_percent(before, after)
    if before <= 0 then
        return 0
    end
    return ((before - after) / before) * 100
end

--- Convert every poem_<id>.json file under a directory to upper-triangular form.
-- For each file, only similarity entries whose numeric `id` is greater than the
-- poem ID taken from the filename are kept, and the file's metadata is updated
-- (format, range, total_comparisons, migrated_at). Files whose metadata already
-- says "triangular_upper" are left untouched.
-- NOTE(review): assumes utils.read_json_file returns a decoded table (or a falsy
-- value on failure) and utils.write_json_file returns a truthy value on success
-- -- confirm against libs/utils.lua.
-- @param similarities_dir directory searched (recursively, via `find`) for poem_*.json
-- @param dry_run when truthy, compute statistics but do not write any file
-- @param max_poem_id optional upper bound written into metadata.range (default 7797)
-- @return false if no files were found; otherwise true iff no write failed
local function migrate_files(similarities_dir, dry_run, max_poem_id)
    dry_run = dry_run or false
    max_poem_id = tonumber(max_poem_id) or 7797

    print("🔄 Migrating similarity files to triangular format")
    print(" Directory: " .. similarities_dir)
    print(" Dry run: " .. tostring(dry_run))
    print()

    -- Get list of all similarity files. The directory is shell-quoted so that
    -- paths containing quotes or spaces cannot break (or inject into) the command.
    local files = {}
    local find_cmd = string.format("find %s -name 'poem_*.json' -type f",
        shell_quote(similarities_dir))
    local handle = io.popen(find_cmd)
    if handle then
        for filepath in handle:lines() do
            files[#files + 1] = filepath
        end
        handle:close()
    end

    if #files == 0 then
        print("❌ No similarity files found in " .. similarities_dir)
        return false
    end

    print(string.format("Found %d files to migrate", #files))
    print()

    local migrated = 0
    local already_triangular = 0
    local unreadable = 0
    local errors = 0
    local total_size_before = 0
    local total_size_after = 0

    for _, filepath in ipairs(files) do
        -- Extract poem ID from filename
        local poem_id = tonumber(filepath:match("poem_(%d+)%.json$"))
        if not poem_id then
            print(string.format("⚠️ Skipping %s (can't extract poem ID)", filepath))
            unreadable = unreadable + 1
            goto continue
        end

        -- Read existing file
        local file_data = utils.read_json_file(filepath)
        if type(file_data) ~= "table" or type(file_data.similarities) ~= "table" then
            print(string.format("⚠️ Skipping poem %d (can't read file)", poem_id))
            unreadable = unreadable + 1
            goto continue
        end

        -- Check if already triangular
        if type(file_data.metadata) == "table"
            and file_data.metadata.format == "triangular_upper" then
            already_triangular = already_triangular + 1
            goto continue
        end

        do
            local orig_size = #file_data.similarities

            -- Filter to keep only upper triangle (other_id > poem_id).
            -- Entries that are not tables or lack a numeric id are dropped.
            local triangular_similarities = {}
            for _, entry in ipairs(file_data.similarities) do
                local other_id = type(entry) == "table" and tonumber(entry.id) or nil
                if other_id and other_id > poem_id then
                    triangular_similarities[#triangular_similarities + 1] = entry
                end
            end
            local new_size = #triangular_similarities

            -- Update metadata
            file_data.similarities = triangular_similarities
            if type(file_data.metadata) ~= "table" then
                file_data.metadata = {}
            end
            file_data.metadata.format = "triangular_upper"
            file_data.metadata.range = string.format("%d-%d", poem_id + 1, max_poem_id)
            file_data.metadata.total_comparisons = new_size
            file_data.metadata.migrated_at = os.date("%Y-%m-%d %H:%M:%S")

            -- Write back (unless dry run)
            if not dry_run then
                local success = utils.write_json_file(filepath, file_data)
                if not success then
                    print(string.format("❌ Error writing poem %d", poem_id))
                    errors = errors + 1
                    goto continue
                end
            end

            -- Only count entries for files that were (or, in a dry run,
            -- would have been) rewritten, so the totals match `migrated`.
            total_size_before = total_size_before + orig_size
            total_size_after = total_size_after + new_size
            migrated = migrated + 1

            -- Progress indicator
            if migrated % 100 == 0 then
                print(string.format("Progress: %d/%d migrated (poem %d: %d → %d entries, %.1f%% reduction)",
                    migrated, #files, poem_id, orig_size, new_size,
                    reduction_percent(orig_size, new_size)))
            end
        end

        ::continue::
    end

    -- Summary
    print()
    print("📊 Migration Summary:")
    print(string.format(" Total files: %d", #files))
    print(string.format(" ✅ Migrated: %d", migrated))
    print(string.format(" ⏭️ Skipped: %d (%d already triangular, %d unreadable)",
        already_triangular + unreadable, already_triangular, unreadable))
    print(string.format(" ❌ Errors: %d", errors))
    print()
    print(string.format(" Entries before: %.1fM", total_size_before / 1000000))
    print(string.format(" Entries after: %.1fM", total_size_after / 1000000))
    print(string.format(" Reduction: %.1f%%",
        reduction_percent(total_size_before, total_size_after)))
    print()

    if dry_run then
        print("🔍 DRY RUN: No files were modified")
        print(" Run without --dry-run to apply changes")
    else
        print("✅ Migration complete!")
    end

    return errors == 0
end
140
-- Command line execution
-- Usage: migrate-to-triangular-files.lua [similarities_dir] [--dry-run]
-- The --dry-run flag is accepted in any position; the first argument that is
-- not the flag is taken as the directory (further arguments are ignored).
-- NOTE(review): `arg[0]` is also set when this file is require()'d from another
-- script started by the standalone interpreter, in which case this block would
-- still run and call os.exit -- confirm whether a stricter main-script check is wanted.
if arg and arg[0] then
    local similarities_dir = "assets/embeddings/embeddinggemma_latest/similarities"
    local dry_run = false
    local dir_given = false

    for i = 1, #arg do
        local value = arg[i]
        if value == "--dry-run" then
            dry_run = true
        elseif not dir_given then
            similarities_dir = value
            dir_given = true
        end
    end

    print("Triangular Similarity Files Migration")
    print("=====================================")
    print()

    local success = migrate_files(similarities_dir, dry_run)
    os.exit(success and 0 or 1)
end

-- Module interface for callers that load this file instead of executing it.
return {migrate_files = migrate_files}
155