scripts/zip-extractor.lua

335 lines

1#!/usr/bin/env lua
2-- ZIP archive extraction script for content processing
3-- Detects and extracts JSON data from ZIP archives for poem extraction pipeline
4
5-- {{{ setup_dir_path
-- Resolve the project root directory.
-- @param provided_dir  directory supplied by the caller; may be nil
-- @return provided_dir when it is truthy, otherwise the built-in default path
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
12-- }}}
13
-- Command-line arguments:
--   arg[1]  project directory (optional; setup_dir_path supplies the default)
--   arg[2]  temporary directory (required; the script aborts without it)
-- The `arg and ...` guards keep a missing `arg` table from raising an index error.
local DIR = setup_dir_path(arg and arg[1])
local TEMP_DIR = arg and arg[2]
if not TEMP_DIR then
    error("Temporary directory required as second argument")
end
17
18-- Set up package path to find libs
19package.path = DIR .. "/libs/?.lua;" .. package.path
20local dkjson = require("dkjson")
21
-- Issue 7-003: Load config for ignored_archives list
local config_loader = require("config-loader")
config_loader.set_project_root(DIR)
local config = config_loader.load()

-- Fall back to an empty list when config.extraction or its ignored_archives
-- entry is absent, so lookups against the list simply never match.
local ignored_archives = {}
if config.extraction and config.extraction.ignored_archives then
    ignored_archives = config.extraction.ignored_archives
end
27
28-- {{{ local function is_ignored_archive
-- Tell whether an archive should be skipped.
-- @param basename  archive name to look up in the ignored_archives list
-- @return true when basename equals one of the list entries, false otherwise
local function is_ignored_archive(basename)
    local ignored = false
    for _, name in ipairs(ignored_archives) do
        if name == basename then
            ignored = true
            break
        end
    end
    return ignored
end
38-- }}}
39
-- ANSI color codes for terminal output.
-- Every sequence is ESC (decimal 27) followed by an SGR parameter.
local ESC = "\027"
local COLOR_GREEN  = ESC .. "[92m" -- Bright green for success (✓, ✅)
local COLOR_BLUE   = ESC .. "[94m" -- Bright blue for info (ℹ️)
local COLOR_RED    = ESC .. "[91m" -- Bright red for errors (✗, ❌)
local COLOR_YELLOW = ESC .. "[93m" -- Bright yellow for warnings (⚠️)
local COLOR_RESET  = ESC .. "[0m"  -- Reset to default
46
47-- {{{ local function get_file_mtime
-- Issue 7-005: Get file modification time for archive date comparison
-- Runs `stat -c %Y` (GNU coreutils syntax) and parses its output.
-- @param file_path  path of the file to inspect
-- @return modification time in seconds since the epoch, or 0 when the path is
--         not a string, the command cannot be started, or its output is not a number
local function get_file_mtime(file_path)
    if type(file_path) ~= "string" then
        return 0
    end

    -- Single-quote the path for the shell; an embedded ' is written as '\''
    -- so file names containing quotes neither break nor extend the command.
    local quoted_path = "'" .. (file_path:gsub("'", "'\\''")) .. "'"

    -- `--` stops option parsing so a path starting with '-' is not read as a flag.
    local handle = io.popen("stat -c %Y -- " .. quoted_path .. " 2>/dev/null")
    if not handle then
        return 0
    end
    local result = handle:read("*a")
    handle:close()

    if result and result ~= "" then
        -- Wrap gsub in parens to discard second return value (count)
        local clean = (result:gsub("%s+", ""))
        return tonumber(clean) or 0
    end
    return 0
end
62-- }}}
63
64-- {{{ local function relative_path
-- Issue 7-003: Show project name instead of "./" when path equals DIR
-- Shorten a path for display.
-- @param absolute_path  path to shorten (string)
-- @return "<project-name>/" when the path is DIR itself, "./<rest>" when the
--         path lies inside DIR, otherwise the path unchanged
local function relative_path(absolute_path)
    if absolute_path == DIR or absolute_path == DIR .. "/" then
        local dir_name = DIR:match("([^/]+)/?$")
        if not dir_name then
            -- DIR has no final component (e.g. "/"); nothing shorter to show.
            return absolute_path
        end
        return dir_name .. "/"
    end

    -- Compare against DIR *with* a trailing slash so that only real
    -- descendants match; a sibling such as "<DIR>-old/x" shares the textual
    -- prefix but is not inside the project.
    local root = DIR
    if root:sub(-1) ~= "/" then
        root = root .. "/"
    end
    if absolute_path:sub(1, #root) == root then
        return "./" .. absolute_path:sub(#root + 1)
    end
    return absolute_path
end
78-- }}}
79
80-- {{{ function detect_archive_type
-- Classify a ZIP archive by inspecting its `unzip -l` listing.
-- @param zip_file  path of the archive
-- @return "fediverse" when the listing mentions outbox.json,
--         "messages" when it mentions export.json,
--         "notes" when it has a notes/ directory or any entry ending in .txt/.md,
--         nil when nothing matches or the listing cannot be read
local function detect_archive_type(zip_file)
    -- Single-quote the path for the shell; an embedded ' is written as '\''.
    local quoted_zip = "'" .. (zip_file:gsub("'", "'\\''")) .. "'"
    local handle = io.popen("unzip -l " .. quoted_zip .. " 2>/dev/null")
    if not handle then
        return nil
    end
    local content = handle:read("*a") or ""
    handle:close()

    -- Drop the "Archive:  <path>" header so the archive's own location
    -- (e.g. a parent directory called notes/) cannot influence the result.
    content = content:gsub("^Archive:[^\n]*\n", "")

    if content:find("outbox%.json") then
        return "fediverse"
    end
    if content:find("export%.json") then
        return "messages"
    end
    if content:find("notes/", 1, true) then
        return "notes"
    end

    -- The extension must be checked per line: the listing always ends with a
    -- totals line, so a `$` anchor on the whole text can never match an entry.
    for line in content:gmatch("[^\r\n]+") do
        if line:find("%.txt%s*$") or line:find("%.md%s*$") then
            return "notes"
        end
    end

    return nil
end
99-- }}}
100
101-- {{{ function detect_archives
-- Find every *.zip file below a directory and classify it.
-- Archives whose basename is on the ignore list are skipped silently;
-- archives of unknown type are reported with a warning and left out.
-- @param input_directory  directory to scan recursively
-- @return array of { path, type, basename, mtime, mtime_date } tables
local function detect_archives(input_directory)
    local archives = {}

    print("🔍 Scanning for ZIP archives in: " .. relative_path(input_directory))

    -- Single-quote the directory for the shell; an embedded ' is written as '\''.
    local quoted_dir = "'" .. (input_directory:gsub("'", "'\\''")) .. "'"
    local handle = io.popen("find " .. quoted_dir .. " -name '*.zip' -type f")
    if not handle then
        return archives
    end

    for file in handle:lines() do
        -- A file named exactly ".zip" has no stem; fall back to the file name
        -- (or the whole path) so the messages below never concatenate nil.
        local basename = file:match("([^/]+)%.zip$") or file:match("([^/]+)$") or file

        -- Issue 7-003: Skip archives in the ignored list (configured in config.lua)
        -- Silently skip - these are known non-content ZIPs
        if not is_ignored_archive(basename) then
            local archive_type = detect_archive_type(file)
            if archive_type then
                -- Issue 7-005: Store modification time for date-based selection
                local mtime = get_file_mtime(file)
                local mtime_date = os.date("%Y-%m-%d", mtime)
                archives[#archives + 1] = {
                    path = file,
                    type = archive_type,
                    basename = basename,
                    mtime = mtime,
                    mtime_date = mtime_date,
                }
                print("📦 Found " .. archive_type .. " archive: " .. basename .. " (" .. mtime_date .. ")")
            else
                -- Issue 7-006: Full-line coloring for warnings
                print(COLOR_YELLOW .. "⚠️ Unknown archive type: " .. basename .. COLOR_RESET)
            end
        end
    end
    handle:close()

    return archives
end
140-- }}}
141
142-- {{{ local function select_archives_by_type
-- Issue 7-005: Groups archives by type, sorts by mtime (newest first),
-- returns selected archives and skipped (older) archives
-- The result is deterministic: types appear in the order they are first seen
-- in the input, and archives with equal mtime are ordered by path.
-- @param archives  array of tables carrying at least `type` and `mtime`
-- @return selected (newest archive of each type), skipped (all the others)
local function select_archives_by_type(archives)
    local by_type = {}
    local type_order = {}

    -- Group by type, remembering first-seen order so the output does not
    -- depend on the unspecified traversal order of pairs().
    for _, archive in ipairs(archives) do
        local group = by_type[archive.type]
        if not group then
            group = {}
            by_type[archive.type] = group
            type_order[#type_order + 1] = archive.type
        end
        group[#group + 1] = archive
    end

    -- Strict ordering: newer mtime first. Equal mtimes (for example when every
    -- lookup failed and returned 0) are broken by path, descending, so the
    -- choice is reproducible instead of left to table.sort.
    local function newer_first(a, b)
        if a.mtime ~= b.mtime then
            return a.mtime > b.mtime
        end
        return (a.path or "") > (b.path or "")
    end

    local selected = {}
    local skipped = {}

    for _, archive_type in ipairs(type_order) do
        local group = by_type[archive_type]
        table.sort(group, newer_first)

        -- First (newest) is selected
        selected[#selected + 1] = group[1]

        -- Rest are skipped (older archives)
        for i = 2, #group do
            skipped[#skipped + 1] = group[i]
        end
    end

    return selected, skipped
end
174-- }}}
175
176-- {{{ function extract_archive_data
-- Extract the relevant files of one archive into <temp_base_dir>/<type>/extract.
--   fediverse: outbox.json (flattened) plus the media_attachments tree
--   messages:  the first export.json pattern that extracts successfully
--   notes:     notes/* plus any *.txt / *.md entries (flattened)
-- @param archive_info   table with `path`, `type` and `basename`
-- @param temp_base_dir  directory under which the per-type directory is created
-- @return <temp_base_dir>/<type> when at least one file ended up in the
--         extract directory, nil otherwise
local function extract_archive_data(archive_info, temp_base_dir)
    -- Single-quote a value for the shell; an embedded ' is written as '\''.
    local function shell_quote(value)
        return "'" .. (tostring(value):gsub("'", "'\\''")) .. "'"
    end

    -- Run a command with its output suppressed and report success.
    -- os.execute returns 0 on Lua 5.1 and true on Lua 5.2+, so accept both.
    local function run_quietly(command)
        local result = os.execute(command .. " >/dev/null 2>&1")
        return result == 0 or result == true
    end

    -- True when the directory (recursively) contains at least one regular file.
    local function contains_files(directory)
        local handle = io.popen("find " .. shell_quote(directory) .. " -type f 2>/dev/null | head -1")
        if not handle then
            return false
        end
        local first = handle:read("*l")
        handle:close()
        return first ~= nil and first ~= ""
    end

    local temp_dir = temp_base_dir .. "/" .. archive_info.type
    local extract_dir = temp_dir .. "/extract"
    local zip_arg = shell_quote(archive_info.path)
    local dest_arg = shell_quote(extract_dir)

    print("📂 Creating temporary directory: " .. relative_path(extract_dir))
    os.execute("mkdir -p " .. dest_arg)

    -- Every unzip call uses -o: output is hidden, so an overwrite prompt
    -- (stale temp directory, or two patterns matching the same entry) would
    -- otherwise block invisibly.
    local extract_files = {}
    if archive_info.type == "fediverse" then
        extract_files = {"outbox.json"}

        -- Also extract media_attachments directory for fediverse archives.
        -- The quoted wildcard is expanded by unzip itself, not by the shell.
        local media_ok = run_quietly(string.format(
            "unzip -o %s 'media_attachments/files/*' -d %s", zip_arg, dest_arg))
        if not media_ok then
            -- Alternative: list the archive and extract each matching entry.
            -- The pipeline's exit status does not say whether anything was
            -- extracted, so look at the result on disk instead.
            run_quietly(string.format(
                "unzip -l %s 2>/dev/null | grep media_attachments | awk '{print $4}' | xargs -I{} unzip -o %s '{}' -d %s",
                zip_arg, zip_arg, dest_arg))
            media_ok = contains_files(extract_dir .. "/media_attachments")
        end

        -- Issue 7-006: Full-line coloring for success messages
        if media_ok then
            print(COLOR_GREEN .. "✅ Extracted media_attachments directory" .. COLOR_RESET)
        else
            print(COLOR_YELLOW .. "⚠️ No media_attachments files extracted from " ..
                tostring(archive_info.basename) .. COLOR_RESET)
        end
    elseif archive_info.type == "messages" then
        -- Matrix exports have nested directory structure; try multiple patterns
        extract_files = {"export.json", "export/export.json", "*/export.json"}
    elseif archive_info.type == "notes" then
        -- Extract the notes directory and any text/markdown files. Each pattern
        -- runs separately because unzip reports failure when any one of
        -- several patterns matches nothing.
        local any_ok = false
        for _, pattern in ipairs({"notes/*", "*.txt", "*.md"}) do
            local pattern_ok = run_quietly(string.format(
                "unzip -o -j %s %s -d %s", zip_arg, shell_quote(pattern), dest_arg))
            any_ok = any_ok or pattern_ok
        end

        -- Issue 7-006: Full-line coloring for success messages
        if any_ok then
            print(COLOR_GREEN .. "✅ Extracted notes directory/text files" .. COLOR_RESET)
        end
        -- extract_files stays empty: notes skip the single-file loop below.
    end

    for _, file in ipairs(extract_files) do
        local extracted = run_quietly(string.format(
            "unzip -o -j %s %s -d %s", zip_arg, shell_quote(file), dest_arg))
        -- Issue 7-006: Full-line coloring for success messages
        if extracted then
            print(COLOR_GREEN .. "✅ Extracted: " .. file .. COLOR_RESET)
            break -- Stop after first successful extraction
        end
    end

    -- The files on disk are the source of truth, whatever the exit codes were.
    -- Issue 7-006: Full-line coloring for success/error messages
    if contains_files(extract_dir) then
        print(COLOR_GREEN .. "✅ Successfully extracted " .. archive_info.type .. " data from " .. archive_info.basename .. COLOR_RESET)
        return temp_dir
    end

    print(COLOR_RED .. "❌ No extractable files found in " .. archive_info.basename .. COLOR_RESET)
    return nil
end
256-- }}}
257
258-- {{{ function create_extraction_summary
-- Extract every archive in the list and collect the results.
-- @param archives       array of archive tables (each needs at least `type`)
-- @param temp_base_dir  directory handed through to extract_archive_data
-- @return table with:
--           total_archives      number of archives passed in
--           extracted_archives  number for which extraction returned a path
--           by_type             type -> count of extracted archives
--           extraction_paths    type -> extraction path (a later archive of
--                               the same type replaces an earlier entry)
--           timestamp           creation time, ISO 8601 in UTC
local function create_extraction_summary(archives, temp_base_dir)
    local summary = {
        total_archives = #archives,
        extracted_archives = 0,
        by_type = {},
        extraction_paths = {},
        -- The leading "!" makes os.date format UTC; without it the value is
        -- local time and the trailing "Z" (UTC designator) would be wrong.
        timestamp = os.date("!%Y-%m-%dT%H:%M:%SZ")
    }

    for _, archive in ipairs(archives) do
        local extraction_path = extract_archive_data(archive, temp_base_dir)
        if extraction_path then
            summary.extracted_archives = summary.extracted_archives + 1
            summary.extraction_paths[archive.type] = extraction_path
            summary.by_type[archive.type] = (summary.by_type[archive.type] or 0) + 1
        end
    end

    return summary
end
283-- }}}
284
-- Main execution
-- Flow: scan DIR/input for archives, keep the newest archive of each type,
-- extract those into TEMP_DIR, print a summary and save it as JSON.
-- Exits with status 1 when no archive is found, when the summary file cannot
-- be written, or when nothing could be extracted.
-- Issue 7-003: Removed duplicate "Starting extraction" message (parent script already printed it)

local all_archives = detect_archives(DIR .. "/input")

-- Issue 7-006: Full-line coloring for error messages
if #all_archives == 0 then
    print(COLOR_RED .. "❌ No valid archives found to extract" .. COLOR_RESET)
    os.exit(1)
end

-- Issue 7-005: Select most recent archive per type, warn about skipped
local selected, skipped = select_archives_by_type(all_archives)

print("\n📊 Archive selection:")
print(" Total found: " .. #all_archives .. ", Selected: " .. #selected .. ", Skipped: " .. #skipped)

-- Warn about skipped archives with full yellow text
-- (the loop is simply a no-op when nothing was skipped)
for _, archive in ipairs(skipped) do
    print(COLOR_YELLOW .. " ⚠️ Skipped older: " .. relative_path(archive.path) ..
        " (" .. archive.mtime_date .. ")" .. COLOR_RESET)
end

-- Extract only selected archives (most recent per type)
local summary = create_extraction_summary(selected, TEMP_DIR)

print("\n📋 Extraction summary:")
print(" Archives processed: " .. summary.extracted_archives .. "/" .. summary.total_archives)
for archive_type, count in pairs(summary.by_type) do
    print(" " .. archive_type .. ": " .. count .. " archive(s)")
    if summary.extraction_paths[archive_type] then
        print(" → " .. relative_path(summary.extraction_paths[archive_type]))
    end
end

-- Save extraction summary for other scripts
local summary_file = TEMP_DIR .. "/extraction-summary.json"
local f, open_err = io.open(summary_file, "w")
if not f then
    -- io.open returns nil plus a message (missing or unwritable TEMP_DIR);
    -- report it instead of failing with a nil-index error on f:write.
    print(COLOR_RED .. "❌ Could not write extraction summary: " .. tostring(open_err) .. COLOR_RESET)
    os.exit(1)
end
f:write(dkjson.encode(summary, { indent = true }))
f:close()

print("💾 Extraction summary saved: " .. relative_path(summary_file))
-- Issue 7-003: Removed redundant "extraction completed" line - summary already shows completion

-- The summary file is written even when nothing was extracted; the exit
-- status below is what signals the failure.
-- Issue 7-006: Full-line coloring for error messages
if summary.extracted_archives == 0 then
    print(COLOR_RED .. "❌ No archives could be extracted" .. COLOR_RESET)
    os.exit(1)
end