scripts/extract-messages.lua
1
2-- Messages content extraction script
3-- Parses exported message JSON and extracts formatted content
4
5-- {{{ setup_dir_path
--- Resolve the project root directory.
-- @param provided_dir optional directory supplied by the caller
-- @return provided_dir when it is truthy, otherwise the built-in default path
local function setup_dir_path(provided_dir)
    local default_dir = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
    return provided_dir or default_dir
end
12-- }}}
13
-- Command-line arguments:
--   arg[1]  project directory (optional; setup_dir_path supplies the default)
--   arg[2]  alternate source directory for a temporary extraction (optional)
local DIR = setup_dir_path(arg and arg[1])
local OVERRIDE_SOURCE = arg and arg[2]

-- Let require() find the libraries bundled under <DIR>/libs
package.path = DIR .. "/libs/?.lua;" .. package.path
local dkjson = require("dkjson")
local exclusion_filter = require("exclusion-filter")

-- Issue 10-003: unified config is read through config-loader
local config_loader = require("config-loader")
config_loader.set_project_root(DIR)
local config = config_loader.load()

-- Issue 10-015: sources configuration (multi-directory support)
local sources_loader = require("sources-loader")
sources_loader.set_project_root(DIR)

-- ANSI escape sequences used to colour terminal output
local ESC = "\027"
local COLOR_GREEN = ESC .. "[92m"  -- bright green: success
local COLOR_BLUE = ESC .. "[94m"   -- bright blue: info
local COLOR_RED = ESC .. "[91m"    -- bright red: errors
local COLOR_YELLOW = ESC .. "[93m" -- bright yellow: warnings
local COLOR_RESET = ESC .. "[0m"   -- back to the terminal default
38
39-- {{{ local function relative_path
40-- Issue 7-003: Show project name instead of "./" when path equals DIR
--- Shorten an absolute path for display.
-- Issue 7-003: when the path is the project root itself, the project's
-- directory name is shown (e.g. "proj/") instead of "./".
-- @param absolute_path string path to shorten
-- @return "<name>/" for the project root, "./<rest>" for paths inside the
--         project root, otherwise absolute_path unchanged
local function relative_path(absolute_path)
    if absolute_path == DIR or absolute_path == DIR .. "/" then
        -- dir_name is nil when DIR has no non-slash component (e.g. "/")
        local dir_name = DIR:match("([^/]+)/?$")
        return (dir_name or "") .. "/"
    end

    -- Compare against the root *with* a trailing slash so that a sibling
    -- such as "/a/project2/x" is not mistaken for a child of "/a/project".
    local root = DIR
    if root:sub(-1) ~= "/" then
        root = root .. "/"
    end
    if absolute_path:sub(1, #root) == root then
        return "./" .. absolute_path:sub(#root + 1)
    end
    return absolute_path
end
53-- }}}
54
-- Issue 10-015a: the messages path comes from the unified sources config.
-- There is no fallback: the script exits when nothing is configured.
local messages_directories = sources_loader.get_directories("messages")
if #messages_directories == 0 then
    print(COLOR_RED .. "❌ Error: sources.messages not configured in config.lua" .. COLOR_RESET)
    os.exit(1)
end

-- Use the primary (first) directory from the sources config
local messages_backup_path = messages_directories[1].path

-- Make the path relative to DIR when it lies inside DIR.
-- The prefix is only stripped at a path boundary, and every leading slash of
-- the remainder is removed, so the result is correct whether or not DIR
-- carries a trailing slash.
if messages_backup_path:sub(1, #DIR) == DIR then
    local remainder = messages_backup_path:sub(#DIR + 1)
    local at_boundary = remainder == ""
        or remainder:sub(1, 1) == "/"
        or DIR:sub(-1) == "/"
    if at_boundary then
        messages_backup_path = (remainder:gsub("^/+", ""))
    end
end
67
-- The override path (used for ZIP extraction) wins over the configured path
local source_base_path = OVERRIDE_SOURCE or (DIR .. "/" .. messages_backup_path)
local source_label = OVERRIDE_SOURCE
    and "🔄 Using temporary extraction source: "
    or "🔄 Using configured source: "
print(source_label .. relative_path(source_base_path))
77
-- Candidate locations for export.json, in priority order:
--   1. <source>/extract/export.json          (backup dir)
--   2. <source>/extract/export/export.json   (nested backup)
--   3. <DIR>/input/extract/export/export.json
--      (Issue 8-054: temp extraction written by run-messages)
local export_candidates = {
    source_base_path .. "/extract/export.json",
    source_base_path .. "/extract/export/export.json",
    DIR .. "/input/extract/export/export.json",
}

local file, file_handle
for position, candidate in ipairs(export_candidates) do
    file = candidate
    file_handle = io.open(candidate, "r")
    if file_handle then
        -- Only the temp-extraction fallback is announced
        if position == 3 then
            print("🔄 Found export.json in temp extraction: " .. relative_path(file))
        end
        break
    end
end

local save_location = DIR .. "/" .. messages_backup_path .. "/files"

if not file_handle then
    -- Issue 7-006: Full-line coloring for error messages
    print(COLOR_RED .. "❌ Error: Could not find export.json at any expected location" .. COLOR_RESET)
    for _, candidate in ipairs(export_candidates) do
        print(" Tried: " .. relative_path(candidate))
    end
    os.exit(1)
end
106
-- Read the whole export and decode it
local opened_file_string = file_handle:read("*a")
file_handle:close()

-- dkjson.decode returns nil, position, message on malformed input.
-- Fail here with a clear message instead of crashing later on data.messages.
local data, decode_pos, decode_err = dkjson.decode(opened_file_string)
if type(data) ~= "table" or type(data.messages) ~= "table" then
    print(COLOR_RED .. "❌ Error: export.json does not contain a valid messages export" .. COLOR_RESET)
    print(" File: " .. relative_path(file))
    if decode_err then
        print(" Reason: " .. tostring(decode_err))
    else
        print(" Reason: missing \"messages\" array")
    end
    os.exit(1)
end
local messages = {}
112
113-- {{{ function format_date
--- Format a Unix timestamp (seconds) as "YYYY-MM-DD HH:MM:SS".
-- The time is rendered by os.date without the UTC prefix, i.e. local time.
-- @param timestamp number of seconds; any non-number prints a warning and
--        is replaced by the current time
-- @return formatted date string
local function format_date(timestamp)
    local is_valid = type(timestamp) == "number"
    if not is_valid then
        print("Warning: Invalid timestamp, using current time.")
    end
    return os.date("%Y-%m-%d %H:%M:%S", is_valid and timestamp or os.time())
end
121-- }}}
122
123-- {{{ function format_iso_date
--- Format a Unix timestamp (seconds) as an ISO-8601 UTC string.
-- The trailing "Z" designates UTC, so the date is rendered with the "!"
-- prefix of os.date; without it the local time would be labelled as UTC.
-- @param timestamp number of seconds; any non-number is replaced by the
--        current time
-- @return string of the form "YYYY-MM-DDTHH:MM:SSZ"
local function format_iso_date(timestamp)
    if type(timestamp) ~= "number" then
        timestamp = os.time()
    end
    -- floor: os.date rejects numbers without an integer representation (5.3+)
    return os.date("!%Y-%m-%dT%H:%M:%SZ", math.floor(timestamp))
end
130-- }}}
131
132-- {{{ function generate_timestamp
--- Turn a millisecond timestamp into a formatted date via format_date.
-- @param timestamp milliseconds; when falsy the current time is used
-- @return whatever format_date returns for the resulting seconds value
local function generate_timestamp(timestamp)
    local seconds
    if not timestamp then
        seconds = os.time()                    -- fallback: now
    else
        seconds = math.floor(timestamp / 1000) -- ms -> s
    end
    return format_date(seconds)
end
141-- }}}
142
143-- {{{ function generate_iso_timestamp
--- Turn a millisecond timestamp into an ISO date via format_iso_date.
-- @param timestamp milliseconds; when falsy the current time is used
-- @return whatever format_iso_date returns for the resulting seconds value
local function generate_iso_timestamp(timestamp)
    local seconds
    if not timestamp then
        seconds = os.time()                    -- fallback: now
    else
        seconds = math.floor(timestamp / 1000) -- ms -> s
    end
    return format_iso_date(seconds)
end
152-- }}}
153
154-- {{{ function generate_poem_metadata
--- Build the metadata table stored alongside an extracted message.
-- @param content string message text
-- @param source_data optional table; when it has origin_server_ts, that value
--        is converted with tonumber and passed to generate_iso_timestamp
-- @return table with character_count (byte length of content), word_count
--         (runs of non-whitespace), has_content_warning (always false),
--         extraction_timestamp (current time, UTC) and, when available,
--         creation_date
local function generate_poem_metadata(content, source_data)
    -- gsub's second result is the number of matches, i.e. the word count
    local _, word_count = content:gsub("%S+", "")

    local metadata = {
        character_count = #content,
        word_count = word_count,
        has_content_warning = false, -- Messages typically don't have CW
        -- "!" renders UTC, which is what the trailing "Z" claims
        extraction_timestamp = os.date("!%Y-%m-%dT%H:%M:%SZ"),
    }

    if source_data and source_data.origin_server_ts then
        metadata.creation_date = generate_iso_timestamp(tonumber(source_data.origin_server_ts))
    end

    return metadata
end
169-- }}}
170
-- Issue 6-031: poem exclusion filter.
-- For messages, exclusion IDs are the message index (numeric).
local poem_exclusions = exclusion_filter.load_default(DIR)
if poem_exclusions:count("messages") > 0 then
    -- Issue 7-006: the whole line is coloured
    local entry_total = poem_exclusions:count("messages")
    print(table.concat({
        COLOR_YELLOW,
        "🚫 Messages exclusion filter: ",
        entry_total,
        " entries",
        COLOR_RESET,
    }))
end
178
179-- {{{ Issue 8-054: Build image lookup from extract/images/ directory
180-- Matrix exports store decrypted images alongside export.json
181-- Matrix renames files to: {original_basename}-{M-D-YYYY} at {H-MM-SS AM/PM}.{ext}
182-- We need to map both exact and prefix matches
--- Map image filenames to their paths under <extract_dir>/images.
-- Every listed file is registered under its exact name. Files carrying the
-- Matrix export suffix "{name}-{M-D-YYYY} at {time}.{ext}" are additionally
-- registered under the reconstructed original name "{name}.{ext}".
-- @param extract_dir string directory that contains the "images" folder
-- @return table filename -> full path; empty when the directory cannot be
--         listed
local function build_image_lookup(extract_dir)
    local lookup = {}
    local images_dir = extract_dir .. "/images"

    -- Single-quote the path for the shell (embedded ' becomes '\''), so that
    -- $, backticks, double quotes and backslashes in the path stay literal.
    local quoted_dir = "'" .. (images_dir:gsub("'", "'\\''")) .. "'"
    local handle = io.popen("ls " .. quoted_dir .. " 2>/dev/null")
    if not handle then
        return lookup
    end

    for filename in handle:lines() do
        local full_path = images_dir .. "/" .. filename
        -- Exact match (unlikely but possible)
        lookup[filename] = full_path

        -- Strip the Matrix timestamp suffix to recover the original name
        local basename, ext = filename:match("^(.+)%-[%d]+%-[%d]+%-[%d]+ at .+%.([^.]+)$")
        if basename and ext then
            lookup[basename .. "." .. ext] = full_path
        end
    end
    handle:close()

    return lookup
end
205
-- Build the image lookup (Issue 8-054). Directories are tried in order and
-- the first one that yields any entry wins:
--   1. the override source (temporary extraction), when given
--   2. the configured messages source directory
--   3. <DIR>/input/extract/export (where run-messages extracts to)
-- A directory that appears twice in the list is scanned only once.
local extract_dir = OVERRIDE_SOURCE or source_base_path
local image_lookup = {}
do
    local candidates = { extract_dir, source_base_path, DIR .. "/input/extract/export" }
    local scanned = {}
    for _, candidate in ipairs(candidates) do
        if not scanned[candidate] then
            scanned[candidate] = true
            image_lookup = build_image_lookup(candidate)
            if next(image_lookup) ~= nil then
                break
            end
        end
    end
end

-- Count distinct files: the lookup may hold two keys (exact name and
-- reconstructed original name) that point at the same path.
local image_lookup_count = 0
do
    local counted_paths = {}
    for _, full_path in pairs(image_lookup) do
        if not counted_paths[full_path] then
            counted_paths[full_path] = true
            image_lookup_count = image_lookup_count + 1
        end
    end
end
if image_lookup_count > 0 then
    -- Issue 7-006: Full-line coloring for info messages
    print(COLOR_BLUE .. "ℹ️ Found " .. image_lookup_count .. " images in extract/images/" .. COLOR_RESET)
end
227-- }}}
228
229-- {{{ Issue 8-054: Detect if content is a bare filename
230-- Returns true if the body looks like just a filename (no meaningful text)
--- Tell whether a message body is nothing but a filename.
-- The whole body must consist of letters, digits, dots, hyphens and
-- underscores, and end in ".<alphanumeric extension>"; any space or other
-- character makes it ordinary text.
-- @param body message body, may be nil
-- @return true when body looks like a bare filename, false otherwise
local function is_bare_filename(body)
    if body then
        local match_start = body:find("^[%w%.%-_]+%.[%w]+$")
        return match_start ~= nil
    end
    return false
end
237-- }}}
238
-- Counters and output list for the extraction pass
local excluded_count = 0
local image_count = 0 -- media messages whose body was a bare filename
local poems_json = {}

-- Issue 8-054: media message types and the label used in placeholders
local MEDIA_LABELS = {
    ["m.image"] = "Image",
    ["m.video"] = "Video",
    ["m.audio"] = "Audio",
    ["m.file"] = "File",
}

-- Walk the messages in array order. Poem IDs are derived from the position
-- in the array, so the traversal must be ordered (ipairs), not pairs.
for index, value in ipairs(data.messages) do
    -- Issue 6-031: the ID is generated before the exclusion check so that an
    -- excluded message still consumes its ID (tombstone: gap in the sequence)
    local poem_id = string.format("%04d", index)

    if poem_exclusions:is_excluded("messages", poem_id) then
        excluded_count = excluded_count + 1
    else
        local event_content = value.content or {}
        local content = event_content.body or " "
        local msgtype = event_content.msgtype

        -- Issue 8-054: media messages (m.image, m.video, m.audio, m.file)
        local attachments = nil
        local media_label = MEDIA_LABELS[msgtype]
        if media_label then
            local filename = event_content.body or ""
            local info = event_content.info

            -- Attachment metadata (matching fediverse attachment format)
            attachments = {
                {
                    media_type = (info and info.mimetype) or "application/octet-stream",
                    width = info and info.w,
                    height = info and info.h,
                    alt_text = nil, -- Matrix doesn't provide alt-text for images
                    relative_path = image_lookup[filename], -- nil when not found
                },
            }

            -- A body that is just a filename is replaced by a descriptive
            -- placeholder so the entry has meaningful text
            if is_bare_filename(filename) then
                content = "[" .. media_label .. ": " .. filename .. "]"
                image_count = image_count + 1
            end
        end

        -- Entry in the JSON format consumed by HTML generation
        poems_json[#poems_json + 1] = {
            id = poem_id,
            category = "messages",
            source_file = "export.json",
            creation_date = generate_iso_timestamp(tonumber(value.origin_server_ts)),
            content_warning = nil,
            content = content,
            raw_content = content, -- Messages don't have HTML markup
            metadata = generate_poem_metadata(content, value),
            attachments = attachments, -- Issue 8-054: nil for text-only messages
        }
    end
end
304
305-- {{{ Generate JSON output for HTML generation
-- Create the output directory. The path is single-quoted for the shell
-- (embedded ' becomes '\'') so spaces and metacharacters are taken literally.
os.execute("mkdir -p '" .. (save_location:gsub("'", "'\\''")) .. "'")

-- Assemble the JSON document
local json_output = {
    poems = poems_json,
    extraction_summary = {
        total_poems = #poems_json,
        poems_excluded = excluded_count, -- Issue 6-031: Excluded poem count
        image_messages = image_count, -- Issue 8-054: Media message count
        by_category = { messages = #poems_json },
        content_warnings = {}, -- Messages typically don't have content warnings
        -- "!" renders UTC, which is what the trailing "Z" claims
        extraction_date = os.date("!%Y-%m-%dT%H:%M:%SZ"),
    },
}

local json_file = save_location .. "/poems.json"
local f, open_err = io.open(json_file, "w")
if not f then
    -- Report the failure instead of crashing on f:write
    print(COLOR_RED .. "❌ Error: Could not open " .. relative_path(json_file)
        .. " for writing: " .. tostring(open_err) .. COLOR_RESET)
    os.exit(1)
end
f:write(dkjson.encode(json_output, { indent = true }))
f:close()

-- Issue 7-006: Full-line coloring for success messages
print(COLOR_GREEN .. "✅ Messages extraction complete" .. COLOR_RESET)
print(" 📄 Generated: " .. relative_path(json_file))
print(" 📊 Messages processed: " .. #poems_json)
if image_count > 0 then
    print(" 🖼️ Media messages: " .. image_count .. " (with attachments)")
end
if excluded_count > 0 then
    print(" 🚫 Excluded: " .. excluded_count .. " (tombstoned)")
end
337-- }}}
338
339