scripts/extract-fediverse.lua

768 lines

1
-- Fediverse content extraction script
-- Parses ActivityPub JSON and extracts formatted posts with attachment metadata
--
-- ACTIVITYPUB ATTACHMENT FORMAT (Mastodon/W3C Standard):
-- Each Note object in outbox.json may contain an "attachment" array:
-- {
--   "type": "Create",
--   "object": {
--     "type": "Note",
--     "content": "<p>Post text here</p>",
--     "attachment": [
--       {
--         "type": "Document",
--         "mediaType": "image/png",      -- MIME type (image/png, image/jpeg, video/mp4, etc.)
--         "url": "https://server.com/media/files/123/456/789/original/abc123.png",
--         "name": "Alt text description", -- User-provided alt text (may be null)
--         "blurhash": "LEHV6nWB2yk8...",  -- Blur hash for placeholder (optional)
--         "width": 1920,                  -- Image dimensions (optional)
--         "height": 1080
--       }
--     ]
--   }
-- }
--
-- URL PATH MAPPING:
-- The URL path structure maps directly to the local media_attachments directory:
--   URL:   https://tech.lgbt/media/files/113/464/378/730/595/557/original/658cbf8cc6804a09.png
--   Local: input/media_attachments/files/113/464/378/730/595/557/original/658cbf8cc6804a09.png
--
-- The numeric segments (113/464/378/...) are derived from Mastodon's internal attachment ID,
-- split into 3-digit chunks for filesystem distribution.
33
-- {{{ setup_dir_path
-- Resolve the project root directory.
-- Returns `provided_dir` when the caller supplied one, otherwise the
-- hard-coded default checkout location.
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
-- }}}
42
-- {{{ parse_args
-- Split the raw argument list into (dir, source_override, include_boosts).
--   * "--include-boosts" / "--no-boosts" set the boost flag to true / false.
--     When neither is present the flag stays nil so the config default applies.
--   * The first argument that does not start with "-" is the project
--     directory; every later one overwrites the source override (last wins).
--   * Any other dash-prefixed argument is skipped.
-- A nil argument list is treated as empty.
local function parse_args(args)
    local argv = args or {}
    local project_dir, override, boosts

    for index = 1, #argv do
        local token = argv[index]
        if token == "--include-boosts" then
            boosts = true
        elseif token == "--no-boosts" then
            boosts = false
        elseif not token:match("^%-") then
            -- Positional argument
            if project_dir then
                override = token
            else
                project_dir = token
            end
        end
    end

    return project_dir, override, boosts
end
-- }}}
75
-- Resolve the project directory and CLI options first: DIR is needed below
-- to locate the bundled libraries.
local parsed_dir, OVERRIDE_SOURCE, CLI_INCLUDE_BOOSTS = parse_args(arg)
local DIR = setup_dir_path(parsed_dir)

-- Put the project's libs/ directory at the front of the module search path
-- so the require calls that follow resolve against it first.
package.path = string.format("%s/libs/?.lua;%s", DIR, package.path)
local dkjson = require("dkjson")
local exclusion_filter = require("exclusion-filter")

-- Issue 10-003: unified configuration is loaded from config.lua
local config_loader = require("config-loader")
config_loader.set_project_root(DIR)
local config = config_loader.load()

-- Issue 10-015: sources configuration (multi-directory support)
local sources_loader = require("sources-loader")
sources_loader.set_project_root(DIR)

-- ANSI escape sequences for coloured terminal output
local COLOR_GREEN = "\027[92m"  -- bright green: success
local COLOR_BLUE = "\027[94m"   -- bright blue: info
local COLOR_RED = "\027[91m"    -- bright red: errors
local COLOR_YELLOW = "\027[93m" -- bright yellow: warnings
local COLOR_RESET = "\027[0m"   -- back to the terminal default
100
-- {{{ local function relative_path
-- Issue 7-003: Show project name instead of "./" when path equals DIR
--
-- Shorten an absolute path for display purposes.
--   * The project root itself (with or without a trailing slash) is shown
--     as "<project-name>/".
--   * A path inside the project root is shown as "./<remainder>".
--   * Every other path is returned unchanged.
-- The prefix test is made on a path-component boundary, so a sibling such as
-- "/a/project-old/x" is not mistaken for a child of "/a/project".
local function relative_path(absolute_path)
    -- Compare against DIR without trailing slashes so "/a/b" and "/a/b/"
    -- behave the same.
    local root = (DIR:gsub("/+$", ""))

    if absolute_path == root
        or absolute_path == root .. "/"
        or absolute_path == DIR
        or absolute_path == DIR .. "/" then
        local dir_name = root:match("([^/]+)$")
        if dir_name then
            return dir_name .. "/"
        end
        -- DIR has no final component (e.g. "/"): nothing sensible to shorten to.
        return absolute_path
    end

    -- Only treat the path as "inside" the project when the character right
    -- after the root prefix is a path separator.
    local boundary = #root + 1
    if absolute_path:sub(1, #root) == root
        and absolute_path:sub(boundary, boundary) == "/" then
        return "./" .. absolute_path:sub(boundary + 1)
    end

    return absolute_path
end
-- }}}
116
-- Issue 10-015a: the fediverse source directory comes from the unified
-- sources config. There is no fallback: an empty list aborts the script.
local fediverse_directories = sources_loader.get_directories("fediverse")
if #fediverse_directories == 0 then
    print(COLOR_RED .. "❌ Error: sources.fediverse not configured in config.lua" .. COLOR_RESET)
    os.exit(1)
end

-- Only the first configured directory is used.
local fediverse_backup_path = fediverse_directories[1].path
do
    -- When the path starts with DIR, drop that prefix plus one more
    -- character (the separating slash) to obtain a DIR-relative path.
    local prefix_length = #DIR
    if string.sub(fediverse_backup_path, 1, prefix_length) == DIR then
        fediverse_backup_path = string.sub(fediverse_backup_path, prefix_length + 2)
    end
end
129
-- Decide whether boosts are included in the extraction.
-- An explicit CLI choice (--include-boosts / --no-boosts) takes precedence;
-- without one, config.privacy.include_boosts is used, defaulting to false.
local function get_include_boosts()
    if CLI_INCLUDE_BOOSTS == nil then
        return config.privacy.include_boosts or false
    end
    return CLI_INCLUDE_BOOSTS
end
138
-- {{{ local function load_boost_content_cache
-- Lazily load the scraped boost content cache from
-- DIR/assets/boost-content-cache.json and memoise it.
-- Returns the decoded `entries` table, which callers index by boost URI.
-- When the file is missing, cannot be decoded, or has no `entries` field,
-- an empty table is cached and returned instead.
local boost_content_cache = nil
local function load_boost_content_cache()
    if boost_content_cache then
        return boost_content_cache
    end

    local handle = io.open(DIR .. "/assets/boost-content-cache.json", "r")
    if handle then
        local raw = handle:read("*a")
        handle:close()

        local decoded, _, decode_err = dkjson.decode(raw)
        if not decode_err and decoded and decoded.entries then
            boost_content_cache = decoded.entries

            -- The entries table is keyed by URI, so count it with pairs.
            local total = 0
            for _ in pairs(boost_content_cache) do
                total = total + 1
            end
            print(" 📥 Loaded boost content cache: " .. total .. " entries")
            return boost_content_cache
        end
    end

    -- Missing or unusable cache: remember an empty table so the file is
    -- not re-read on every call.
    boost_content_cache = {}
    return boost_content_cache
end
-- }}}
171
-- Effective privacy settings for this run: values from config.privacy with
-- defaults filled in, plus the CLI-aware boost switch.
local privacy_config
do
    local privacy = config.privacy

    -- `value or true` is always true, so an explicit `false` in the config
    -- would be silently ignored. Apply the default only when the option is
    -- absent.
    local preserve_original_length = privacy.preserve_original_length
    if preserve_original_length == nil then
        preserve_original_length = true
    end

    privacy_config = {
        mode = privacy.mode or "clean",
        anonymization_prefix = privacy.anonymization_prefix or "user-",
        include_boosts = get_include_boosts(),
        preserve_original_length = preserve_original_length,
        store_anonymization_map = privacy.store_anonymization_map or false,
        local_server_domain = privacy.local_server_domain or "tech.lgbt",
        debug_anonymization = false -- Debug flag, not in config
    }
end
181
-- Announce boost inclusion when it is switched on
if privacy_config.include_boosts then
    print("📤 Including fediverse boosts in extraction (CLI flag or config)")
end

-- Pick the directory to read from: a CLI override (used for ZIP extraction)
-- wins over the configured backup path.
local source_base_path
do
    local label
    if OVERRIDE_SOURCE then
        source_base_path = OVERRIDE_SOURCE
        label = "🔄 Using temporary extraction source: "
    else
        source_base_path = DIR .. "/" .. fediverse_backup_path
        label = "🔄 Using configured source: "
    end
    print(label .. relative_path(source_base_path))
end

-- outbox.json sits in an extract/ subdirectory unless the source path
-- itself already ends in "extract".
local file = source_base_path
    .. (source_base_path:match("extract$") and "/outbox.json" or "/extract/outbox.json")

-- Output always goes under the configured backup path, even when reading
-- from an override source.
local save_location = DIR .. "/" .. fediverse_backup_path .. "/files"
205
-- Load and parse ActivityPub data.
-- Aborts with exit code 1 when the file cannot be opened, is not valid
-- JSON, or does not contain an `orderedItems` array (the rest of the script
-- iterates over it unconditionally).
print("🔄 Loading ActivityPub data from: " .. relative_path(file))
local opened_file = io.open(file, "r")
if not opened_file then
    -- Issue 7-006: Full-line coloring for error messages
    print(COLOR_RED .. "❌ Error: Could not open file " .. file .. COLOR_RESET)
    print(" Make sure the file exists and is readable")
    os.exit(1)
end

local opened_file_string = opened_file:read("*a")
opened_file:close()

-- dkjson.decode returns (value, position, error message); keep the message
-- so a parse failure can be diagnosed.
local data, _, decode_err = dkjson.decode(opened_file_string)
if type(data) ~= "table" then
    -- Issue 7-006: Full-line coloring for error messages
    print(COLOR_RED .. "❌ Error: Could not parse JSON data from " .. file .. COLOR_RESET)
    if decode_err then
        print(" " .. tostring(decode_err))
    end
    os.exit(1)
end

if type(data.orderedItems) ~= "table" then
    print(COLOR_RED .. "❌ Error: No orderedItems array found in " .. file .. COLOR_RESET)
    os.exit(1)
end

-- Issue 7-006: Full-line coloring for success messages
print(COLOR_GREEN .. "✅ Loaded ActivityPub data: " .. (data.totalItems or #data.orderedItems) .. " activities" .. COLOR_RESET)
228
-- Issue 6-031: poem exclusion filter.
-- Excluded poems keep their ID reserved (tombstoning), so anchor links to
-- the remaining poems stay stable.
local poem_exclusions = exclusion_filter.load_default(DIR)
if 0 < poem_exclusions:count() then
    -- Issue 7-006: whole line is coloured
    print(COLOR_YELLOW .. "🚫 Exclusion filter loaded: " .. poem_exclusions:summary() .. COLOR_RESET)
end

-- Anonymisation state shared by the mention helpers:
--   user_anonymization_map: normalised username -> pseudonym
--   user_counter: number used for the next new pseudonym
local user_anonymization_map, user_counter = {}, 1
240
-- {{{ function normalize_username
-- Reduce a username to the canonical form used as anonymisation key.
--   1. Numeric path segments such as "/111978500472309702" are removed.
--   2. Known short aliases are replaced by their canonical long form.
-- Returns the normalised username.
local function normalize_username(username)
    -- Short form -> canonical form, based on patterns observed in the data.
    -- Extend this table as further aliases turn up.
    local aliases = {
        ["wyatt"] = "wyatt8740",
    }

    -- gsub also returns a match count; the single local keeps only the string.
    local stripped = username:gsub("/[0-9]+", "")

    return aliases[stripped] or stripped
end
-- }}}
262
-- {{{ function anonymize_mention
-- Return the pseudonym for a mentioned user, creating one on first sight.
--
-- The mapping key is the normalised username only. `server` is deliberately
-- not part of the key, so the same username on different servers (server
-- migrations, cross-server mentions) maps to one pseudonym. `server` is
-- only used in the debug output.
-- New pseudonyms are privacy_config.anonymization_prefix followed by the
-- running user_counter.
local function anonymize_mention(username, server)
    local map_key = normalize_username(username)
    local debug_enabled = privacy_config.debug_anonymization

    if debug_enabled then
        io.stderr:write(string.format("DEBUG: anonymize_mention: '%s' -> '%s' @ '%s'\n",
            username, map_key, server or "local"))
    end

    local pseudonym = user_anonymization_map[map_key]
    if not pseudonym then
        pseudonym = privacy_config.anonymization_prefix .. user_counter
        user_anonymization_map[map_key] = pseudonym
        user_counter = user_counter + 1
        if debug_enabled then
            io.stderr:write(string.format(" -> New mapping: %s = %s\n", map_key, pseudonym))
        end
    end

    return pseudonym
end
-- }}}
289
-- {{{ function process_mentions_for_privacy
-- Replace every @mention in `content` with an anonymised pseudonym.
--
-- Parameters:
--   content      (string) raw post content, HTML or plain text
--   privacy_mode (string) only "clean" triggers anonymisation
-- Returns two values: (processed_content, original_content). For any mode
-- other than "clean" both are the unmodified input.
--
-- Each mention is rewritten exactly once. While the passes run, generated
-- pseudonyms carry a control-character marker instead of "@", so a later
-- pattern cannot match an already-produced "@user-N" and anonymise it a
-- second time. Re-matching would inflate the user counter, hand the same
-- person different pseudonyms depending on the surrounding text, and loop
-- forever on text that starts with two mentions. The marker is turned back
-- into "@" at the very end.
local function process_mentions_for_privacy(content, privacy_mode)
    if privacy_mode ~= "clean" then
        return content, content -- Return original for dirty mode
    end

    -- Pick a marker byte that does not occur in the content ("\0" is avoided
    -- because some Lua versions cannot handle it inside patterns).
    local marker = "\1"
    for byte = 1, 8 do
        local candidate = string.char(byte)
        if not content:find(candidate, 1, true) then
            marker = candidate
            break
        end
    end

    -- Marker-prefixed pseudonym for one mention; the server is forwarded to
    -- anonymize_mention unchanged.
    local function pseudonym(user, server)
        return marker .. anonymize_mention(user, server)
    end

    local processed = content

    -- HTML mention markup:
    -- <span class="h-card">...<a href="https://server/@user">@<span>user</span></a></span>
    -- The username from the URL is used rather than the display text, which
    -- may be shortened.
    processed = processed:gsub('<span class="h%-card"[^>]*>.-<a href="[^"]*://([^/"]+)/@([^"/?"]*)[^"]*"[^>]*>@<span>([^<]*)</span></a></span>', function(server, user)
        return pseudonym(user, server)
    end)

    -- Simpler HTML mentions:
    -- <a href="https://server/users/user" class="u-url mention">@<span>user</span></a>
    processed = processed:gsub('<a href="[^"]*://([^/"]+)/users/([^"/?"]*)[^"]*"[^>]*>@<span>([^<]*)</span></a>', function(server, user)
        return pseudonym(user, server)
    end)

    -- Plain-text full mentions: @user@domain.tld
    processed = processed:gsub("@([%w%.%-_]+)@([%w%.%-]+%.%w+)", pseudonym)

    -- All remaining plain-text mentions (@user at the start, in the middle,
    -- before punctuation, or at the end) in one pass.
    local subject = processed
    processed = subject:gsub("@([%w%.%-_]+)()", function(user, next_pos)
        -- "@name@..." that did not qualify as a full mention is left alone.
        if subject:sub(next_pos, next_pos) == "@" then
            return nil
        end
        -- Sentence punctuation directly after a mention ("@alice.") is not
        -- part of the username; keep it outside the pseudonym so the same
        -- user always maps to the same key.
        local name, trailing = user:match("^(.-)([%.%-]*)$")
        if name == "" then
            return nil
        end
        return pseudonym(name, nil) .. trailing
    end)

    -- Turn the markers back into the visible "@".
    processed = processed:gsub(marker, "@")

    return processed, content
end
-- }}}
352
-- {{{ function categorize_activity
-- Classify an outbox activity.
-- Returns two values: a category name and the activity's object.
--   "original_post", object  for a Create activity whose object is a Note
--   "boost", object          for an Announce activity
--   "unknown", nil           for everything else
local function categorize_activity(activity)
    local kind = activity.type
    local target = activity.object

    if kind == "Create" and target and target.type == "Note" then
        return "original_post", target
    end

    if kind == "Announce" then
        return "boost", target
    end

    return "unknown", nil
end
-- }}}
364
-- {{{ function extract_boost_content
-- Build a content record for an Announce (boost) activity.
--
-- The boosted object is handled according to its shape:
--   * URI string with a non-empty cache entry -> "cached_external_boost"
--     carrying the scraped HTML (cleaned later by clean_html)
--   * URI string without usable cache entry   -> "external_boost" placeholder
--   * embedded table with non-empty content   -> "embedded_boost"
--   * embedded table with only an id          -> "external_boost" placeholder
--     flagged content_unavailable
-- Returns nil when none of these apply.
-- Issue 10-037: an empty string is truthy in Lua, so content is compared
-- with "" explicitly in both the cached and the embedded case.
local function extract_boost_content(announce_activity)
    local target = announce_activity.object
    local boosted_at = announce_activity.published
    local kind = type(target)

    if kind == "string" then
        local hit = load_boost_content_cache()[target]

        if hit and hit.content and hit.content ~= "" then
            return {
                type = "cached_external_boost",
                uri = target,
                boost_timestamp = boosted_at,
                content = hit.content,
                content_warning = hit.summary, -- CW of the original post
                sensitive = hit.sensitive,
                original_published = hit.published,
                original_author = hit.attributed_to,
                metadata = {
                    is_boost = true,
                    boost_type = "cached_external",
                    original_uri = target,
                    boost_date = boosted_at,
                    scraped_at = hit.scraped_at,
                    original_author = hit.attributed_to
                }
            }
        end

        -- Nothing usable in the cache: emit a placeholder entry
        return {
            type = "external_boost",
            uri = target,
            boost_timestamp = boosted_at,
            content = "External post: " .. target,
            metadata = {
                is_boost = true,
                boost_type = "external",
                original_uri = target,
                boost_date = boosted_at
            }
        }
    end

    if kind ~= "table" then
        return nil
    end

    -- Embedded object that carries its own content
    if target.content and target.content ~= "" then
        return {
            type = "embedded_boost",
            content = target.content,
            original_author = target.attributedTo,
            boost_timestamp = boosted_at,
            original_timestamp = target.published,
            metadata = {
                is_boost = true,
                boost_type = "embedded",
                original_author = target.attributedTo,
                boost_date = boosted_at,
                original_date = target.published
            }
        }
    end

    -- Embedded object without content: fall back to its id as the URI
    if target.id then
        return {
            type = "external_boost",
            uri = target.id,
            boost_timestamp = boosted_at,
            content = "External post: " .. target.id,
            metadata = {
                is_boost = true,
                boost_type = "embedded_empty",
                original_uri = target.id,
                original_author = target.attributedTo,
                boost_date = boosted_at,
                content_unavailable = true
            }
        }
    end

    return nil
end
-- }}}
455
-- {{{ local function clean_html
-- Convert post HTML into plain text (what Mastodon counts).
--
-- Steps, in this order:
--   1. <p> becomes a blank line, every <br> variant becomes a newline.
--   2. All remaining tags are removed.
--   3. HTML entities are decoded.
--   4. Escaped quotes and emoticon fix-ups are applied.
--   5. Leading and trailing newlines are trimmed.
-- Tags are stripped BEFORE entities are decoded: text the author escaped
-- (e.g. "a &lt; b ... c &gt; d") would otherwise turn into "<...>" and be
-- deleted as if it were markup. "&amp;" is decoded last so that
-- "&amp;lt;" yields the literal text "&lt;" rather than "<".
local function clean_html(content)
    local clean = content:gsub("<p>", "\n\n")
    -- Issue 6-032: Handle all BR tag variants (<br>, <br/>, <br />)
    -- Mastodon uses XHTML-style <br /> which was causing words to run together
    clean = clean:gsub("<br%s*/?>", "\n")

    -- Remove every remaining tag while "<" and ">" can only be markup
    clean = clean:gsub("<[^>]+>", "")

    -- Decode entities; "&amp;" must come last to avoid double-unescaping
    clean = clean:gsub("&#39;", "'")
    clean = clean:gsub("&quot;", "\"")
    clean = clean:gsub("&lt;", "<")
    clean = clean:gsub("&gt;", ">")
    clean = clean:gsub("&amp;", "&")

    -- Backslash-escaped double quotes become plain double quotes
    clean = clean:gsub("\\\"", "\"")

    -- Emoticon repair: the literal text " _^" is rewritten to "^_^".
    clean = clean:gsub(" _^", "^_^")
    -- NOTE(review): the leading "^" here is a pattern anchor, so this only
    -- matches "^_^" at the very start and replaces it with itself (no-op).
    -- It was presumably meant to collapse a doubled caret; kept unchanged
    -- until the intent is confirmed.
    clean = clean:gsub("^^_^", "^_^")

    clean = clean:gsub("^\n+", ""):gsub("\n+$", "") -- Trim newlines
    return clean
end
-- }}}
476
-- {{{ function process_fediverse_content
-- Turn raw post HTML into the set of text variants stored for a poem.
--
-- Parameters:
--   raw_content  (string|nil) post HTML; nil/false yields nil
--   cw           (string|nil) content warning text
--   privacy_mode (string)     passed to process_mentions_for_privacy
-- Returns a table with:
--   content              display text: mentions processed, then HTML cleaned
--   raw_content          the input as given
--   original_content     the content before mention processing
--   golden_poem_content  HTML-cleaned text WITHOUT mention processing,
--                        used for the 1024-character count
--   content_warning      cw, or nil when cw is empty/absent
--   privacy_applied      true when privacy_mode is "clean"
local function process_fediverse_content(raw_content, cw, privacy_mode)
    if not raw_content then
        return nil
    end

    -- Mentions are handled before HTML cleaning because the mention
    -- patterns rely on the HTML structure still being present.
    local anonymized, untouched = process_mentions_for_privacy(raw_content, privacy_mode)

    local warning = nil
    if cw and cw ~= "" then
        warning = cw
    end

    return {
        content = clean_html(anonymized),
        raw_content = raw_content,
        original_content = untouched,
        golden_poem_content = clean_html(untouched),
        content_warning = warning,
        privacy_applied = (privacy_mode == "clean")
    }
end
-- }}}
500
-- {{{ function extract_date
-- Return the first YYYY-MM-DD date found in `timestamp`.
-- Falls back to "0000-00-00" when the timestamp is missing or contains no
-- such date.
local function extract_date(timestamp)
    local date = timestamp and timestamp:match("(%d%d%d%d%-%d%d%-%d%d)")
    if date then
        return date
    end
    return "0000-00-00"
end
-- }}}
506
-- {{{ function extract_full_date
-- Return the YYYY-MM-DDTHH:MM:SS part of `timestamp`.
--   * A timestamp without that shape is returned unchanged.
--   * A missing timestamp yields the current local time in the same format.
local function extract_full_date(timestamp)
    if not timestamp then
        return os.date("%Y-%m-%dT%H:%M:%S")
    end

    local stamp = timestamp:match("(%d%d%d%d%-%d%d%-%d%dT%d%d:%d%d:%d%d)")
    return stamp or timestamp
end
-- }}}
515
-- {{{ function generate_poem_metadata
-- Compute the per-poem metadata table.
--
-- Parameters:
--   content             (string) display text (after privacy processing)
--   cw                  (string|nil) content warning text
--   source_data         (table|nil) when it has a `published` field, that
--                       timestamp becomes metadata.creation_date
--   golden_poem_content (string|nil) HTML-cleaned, pre-anonymisation text;
--                       `content` is used when it is absent
-- The golden poem length is the byte length (string.len) of the golden
-- content plus the content warning text (without any "CW: " prefix, per
-- 6-027); a poem is golden when that length is exactly 1024.
local function generate_poem_metadata(content, cw, source_data, golden_poem_content)
    local has_cw = (cw and cw ~= "")

    local golden_length = string.len(golden_poem_content or content)
    if has_cw then
        golden_length = golden_length + string.len(cw)
    end

    -- gsub's second result is the number of matches, i.e. the number of
    -- whitespace-separated words.
    local _, words = content:gsub("%S+", "")

    local metadata = {
        character_count = string.len(content),
        golden_poem_character_count = golden_length,
        is_golden_poem = (golden_length == 1024),
        word_count = words,
        has_content_warning = has_cw,
        extraction_timestamp = os.date("%Y-%m-%dT%H:%M:%SZ")
    }

    local published = source_data and source_data.published
    if published then
        metadata.creation_date = extract_full_date(published)
    end

    return metadata
end
-- }}}
544
-- {{{ function extract_attachments
-- Collect media attachment metadata from an ActivityPub Note object.
--
-- Only entries of type "Document" that have a url are kept. For each one
-- the part of the URL after the first "/files/" is stored, prefixed with
-- "files/", as the path relative to the local media directory, e.g.
--   https://server.com/media/files/123/456/789/original/filename.ext
--   -> files/123/456/789/original/filename.ext
-- (nil when the URL contains no "/files/" segment).
-- Returns an array of attachment tables, or nil when the note has no
-- attachment field or no entry qualifies.
local function extract_attachments(content_object)
    local raw_list = content_object.attachment
    if not raw_list then
        return nil
    end

    local collected = {}
    for _, item in ipairs(raw_list) do
        if item.type == "Document" and item.url then
            local tail = item.url:match("/files/(.+)$")
            local local_path = nil
            if tail then
                local_path = "files/" .. tail
            end

            collected[#collected + 1] = {
                media_type = item.mediaType,
                url = item.url,
                relative_path = local_path,
                alt_text = item.name, -- nil when no alt text was provided
                width = item.width,
                height = item.height,
                blurhash = item.blurhash
            }
        end
    end

    if #collected == 0 then
        return nil
    end
    return collected
end
-- }}}
584
-- Accumulators for the extraction pass
local poems_json = {}
local boost_count = 0
local original_count = 0
local attachment_count = 0
local excluded_count = 0 -- Issue 6-031: Track excluded poems
-- Issue 10-038: Separate ID numbering for fediverse_boost category
-- Boosts get their own sequential IDs starting from 0001, independent of fediverse posts
local boost_id_counter = 1

print("🔄 Processing activities with privacy mode: " .. privacy_config.mode)
print("🔄 Include boosts: " .. tostring(privacy_config.include_boosts))

-- Walk the outbox in array order. ipairs is used (not pairs) because the
-- sequential boost IDs and the order of poems_json depend on visiting the
-- activities in sequence, which pairs does not guarantee.
for key, activity in ipairs(data.orderedItems) do
    local activity_type, content_object = categorize_activity(activity)

    -- Issue 6-031: Generate poem ID early for exclusion check
    -- IDs are the activity's position in the outbox, so excluding a poem
    -- leaves a gap instead of renumbering the rest.
    local poem_id = string.format("%04d", key)

    if activity_type == "original_post" then
        -- Issue 6-031: Check exclusion filter (tombstone - leaves gap in ID sequence)
        if poem_exclusions:is_excluded("fediverse", poem_id) then
            excluded_count = excluded_count + 1
            goto continue
        end

        -- Process original posts (Create activities)
        local cw = content_object.summary or ""
        local content = content_object.content

        -- Process content with privacy settings
        local processed_content = process_fediverse_content(content, cw, privacy_config.mode)
        if processed_content then
            local poem_entry = {
                id = poem_id,
                category = "fediverse",
                source_file = "outbox.json",
                creation_date = extract_full_date(activity.published),
                content_warning = processed_content.content_warning,
                content = processed_content.content,
                raw_content = processed_content.raw_content,
                metadata = generate_poem_metadata(processed_content.content, cw, activity, processed_content.golden_poem_content)
            }

            -- Add privacy metadata
            if processed_content.privacy_applied then
                poem_entry.metadata.privacy_mode = privacy_config.mode
                poem_entry.metadata.mentions_anonymized = true
                if privacy_config.preserve_original_length then
                    poem_entry.metadata.original_character_count = string.len(processed_content.original_content)
                end
            end

            -- Extract media attachments from the Note object
            local attachments = extract_attachments(content_object)
            if attachments then
                poem_entry.attachments = attachments
                poem_entry.metadata.has_attachments = true
                poem_entry.metadata.attachment_count = #attachments
                attachment_count = attachment_count + #attachments
            end

            table.insert(poems_json, poem_entry)
            original_count = original_count + 1
        end

    elseif activity_type == "boost" and privacy_config.include_boosts then
        -- Issue 10-038: Generate separate boost ID using boost_id_counter
        -- Boosts have their own ID sequence: fediverse_boost/0001, 0002, etc.
        local boost_id = string.format("%04d", boost_id_counter)

        -- Issue 6-031: Check exclusion filter for boosts (using boost-specific ID)
        if poem_exclusions:is_excluded("fediverse_boost", boost_id) then
            excluded_count = excluded_count + 1
            -- Consume the excluded ID (tombstone). Without this the counter
            -- would stay on the excluded ID and every following boost would
            -- be excluded as well.
            boost_id_counter = boost_id_counter + 1
            goto continue
        end

        -- Process boosted content when enabled
        local boost_content = extract_boost_content(activity)
        if boost_content then
            -- Apply privacy processing to boost content too
            local processed_boost = process_fediverse_content(boost_content.content, "", privacy_config.mode)
            if processed_boost then
                local boost_entry = {
                    id = boost_id,
                    category = "fediverse_boost",
                    source_file = "outbox.json",
                    creation_date = extract_full_date(activity.published),
                    content = processed_boost.content,
                    raw_content = processed_boost.raw_content,
                    metadata = boost_content.metadata
                }

                -- Add privacy metadata for boosts
                if processed_boost.privacy_applied then
                    boost_entry.metadata.privacy_mode = privacy_config.mode
                    boost_entry.metadata.mentions_anonymized = true
                end

                table.insert(poems_json, boost_entry)
                boost_count = boost_count + 1
                -- Issue 10-038: Increment boost ID counter for next boost
                boost_id_counter = boost_id_counter + 1
            end
        end
    end

    ::continue::
end
695
-- {{{ Generate JSON output for HTML generation
-- Writes save_location/poems.json (poems plus an extraction summary) and
-- prints a human-readable summary. Exits with code 1 when the output file
-- cannot be opened for writing.

-- Create output directory. The path is single-quoted (embedded quotes
-- escaped) so spaces or shell metacharacters in it cannot break or alter
-- the command.
os.execute("mkdir -p '" .. (save_location:gsub("'", "'\\''")) .. "'")

-- Count posts with attachments for statistics
local posts_with_attachments = 0
for _, poem in ipairs(poems_json) do
    if poem.attachments then
        posts_with_attachments = posts_with_attachments + 1
    end
end

-- Generate JSON output
local json_output = {
    poems = poems_json,
    extraction_summary = {
        total_poems = #poems_json,
        original_posts = original_count,
        boosted_posts = boost_count,
        poems_excluded = excluded_count, -- Issue 6-031: Excluded poem count
        by_category = {
            fediverse = original_count,
            fediverse_boost = boost_count
        },
        content_warnings = {},
        extraction_date = os.date("%Y-%m-%dT%H:%M:%SZ"),
        privacy_settings = {
            mode = privacy_config.mode,
            include_boosts = privacy_config.include_boosts,
            mentions_anonymized = (privacy_config.mode == "clean"),
            anonymization_prefix = privacy_config.anonymization_prefix
        },
        attachment_statistics = {
            total_attachments = attachment_count,
            posts_with_attachments = posts_with_attachments
        }
    }
}

-- Collect unique content warnings
local cw_set = {}
for _, poem in ipairs(poems_json) do
    if poem.content_warning then
        cw_set[poem.content_warning] = true
    end
end
for cw, _ in pairs(cw_set) do
    table.insert(json_output.extraction_summary.content_warnings, cw)
end
-- pairs has no defined order; sort so repeated runs on the same data
-- produce the same file.
table.sort(json_output.extraction_summary.content_warnings)

local json_file = save_location .. "/poems.json"
local f, open_err = io.open(json_file, "w")
if not f then
    print(COLOR_RED .. "❌ Error: Could not open " .. json_file .. " for writing (" .. tostring(open_err) .. ")" .. COLOR_RESET)
    os.exit(1)
end
f:write(dkjson.encode(json_output, { indent = true }))
f:close()

-- Issue 7-006: Full-line coloring for success messages
print(COLOR_GREEN .. "✅ Fediverse extraction complete" .. COLOR_RESET)
print(" 📄 Generated: " .. relative_path(json_file))
print(" 📊 Total posts processed: " .. #poems_json)
print(" 📝 Original posts: " .. original_count)
print(" 🔄 Boosted posts: " .. boost_count)
if excluded_count > 0 then
    print(" 🚫 Excluded posts: " .. excluded_count .. " (tombstoned)")
end
print(" 🖼️ Attachments found: " .. attachment_count .. " in " .. posts_with_attachments .. " posts")
print(" 🚨 Content warnings: " .. #json_output.extraction_summary.content_warnings)
print(" 🔒 Privacy mode: " .. privacy_config.mode)
if privacy_config.mode == "clean" then
    print(" 🎭 Mentions anonymized: " .. (user_counter - 1) .. " users")
end
-- }}}
767
768