scripts/extract-fediverse.lua

768 lines

-- Fediverse content extraction script
-- Parses ActivityPub JSON and extracts formatted posts with attachment metadata
--
-- ACTIVITYPUB ATTACHMENT FORMAT (Mastodon/W3C Standard):
-- Each Note object in outbox.json may contain an "attachment" array:
--   {
--     "type": "Create",
--     "object": {
--       "type": "Note",
--       "content": "Post text here",
--       "attachment": [
--         {
--           "type": "Document",
--           "mediaType": "image/png",  -- MIME type (image/png, image/jpeg, video/mp4, etc.)
--           "url": "https://server.com/media/files/123/456/789/original/abc123.png",
--           "name": "Alt text description",  -- User-provided alt text (may be null)
--           "blurhash": "LEHV6nWB2yk8...",   -- Blur hash for placeholder (optional)
--           "width": 1920,                   -- Image dimensions (optional)
--           "height": 1080
--         }
--       ]
--     }
--   }
--
-- URL PATH MAPPING:
-- The URL path structure maps directly to the local media_attachments directory:
--   URL:   https://tech.lgbt/media/files/113/464/378/730/595/557/original/658cbf8cc6804a09.png
--   Local: input/media_attachments/files/113/464/378/730/595/557/original/658cbf8cc6804a09.png
--
-- The numeric segments (113/464/378/...) are derived from Mastodon's internal attachment ID,
-- split into 3-digit chunks for filesystem distribution.

-- {{{ setup_dir_path
-- Resolve the project root directory.
-- A caller-supplied directory always wins; without one the built-in default
-- checkout location is returned.
local function setup_dir_path(provided_dir)
    local default_root = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
    return provided_dir or default_root
end
-- }}}

-- {{{ parse_args
-- Split the command line into (dir, source_override, include_boosts).
--   --include-boosts / --no-boosts   force boost handling on / off
--   first bare argument              project directory
--   any later bare argument          source override (the last one wins)
-- Other dash-prefixed arguments are skipped. include_boosts stays nil when
-- neither flag is given, which callers treat as "use the config default".
local function parse_args(args)
    local argv = args or {}
    local dir, source_override, include_boosts

    for index = 1, #argv do
        local token = argv[index]
        if token == "--include-boosts" then
            include_boosts = true
        elseif token == "--no-boosts" then
            include_boosts = false
        elseif not token:match("^%-") then
            -- Positional: the first fills dir, every later one replaces the override
            if dir then
                source_override = token
            else
                dir = token
            end
        end
    end

    return dir, source_override, include_boosts
end
-- }}}

-- Resolve the project directory and the command line options
local parsed_dir, OVERRIDE_SOURCE, CLI_INCLUDE_BOOSTS = parse_args(arg)
local DIR = setup_dir_path(parsed_dir)

-- Put the project's bundled libraries first on the module search path
package.path = string.format("%s/libs/?.lua;%s", DIR, package.path)
local dkjson = require("dkjson")
local exclusion_filter = require("exclusion-filter")

-- Issue 10-003: unified configuration is read from config.lua
local config_loader = require("config-loader")
config_loader.set_project_root(DIR)
local config = config_loader.load()

-- Issue 10-015: sources configuration (multi-directory support)
local sources_loader = require("sources-loader")
sources_loader.set_project_root(DIR)

-- ANSI color codes for terminal output
local COLOR_GREEN  = "\027[92m" -- Bright green: success (✓, ✅)
local COLOR_BLUE   = "\027[94m" -- Bright blue: info (ℹ️)
local COLOR_RED    = "\027[91m" -- Bright red: errors (✗, ❌)
local COLOR_YELLOW = "\027[93m" -- Bright yellow: warnings (⚠️)
local COLOR_RESET  = "\027[0m"  -- Back to the terminal default


-- {{{ local function relative_path
-- Issue 7-003: Show project name instead of "./" when path equals DIR
--
-- Shorten an absolute path for log output.
--   path equal to DIR (with or without a trailing slash) -> "<project-name>/"
--   path inside DIR                                       -> "./<remainder>"
--   anything else                                         -> returned unchanged
--
-- "Inside DIR" requires a "/" directly after the DIR prefix, so a sibling such
-- as "<DIR>-backup/x" is returned unchanged instead of becoming "./-backup/x".
-- DIR is compared without trailing slashes, so "/proj" and "/proj/" behave the same.
local function relative_path(absolute_path)
    -- Parentheses drop gsub's second return value (the substitution count).
    local root = (DIR:gsub("/+$", ""))

    if absolute_path == root or absolute_path == root .. "/" then
        -- dir_name is nil only when DIR is the filesystem root itself.
        local dir_name = root:match("([^/]+)$")
        return (dir_name or "") .. "/"
    end

    local prefix = root .. "/"
    if absolute_path:sub(1, #prefix) == prefix then
        return "./" .. absolute_path:sub(#prefix + 1)
    end

    return absolute_path
end
-- }}}


-- Issue 10-015a: Get fediverse path from unified sources config (no fallback - errors if not configured)
local fediverse_directories = sources_loader.get_directories("fediverse")
if type(fediverse_directories) ~= "table" or #fediverse_directories == 0 then
    print(COLOR_RED .. "❌ Error: sources.fediverse not configured in config.lua" .. COLOR_RESET)
    os.exit(1)
end

-- Use the primary directory from sources config
local fediverse_backup_path = fediverse_directories[1].path

-- Make the path relative to DIR when it lies inside DIR (sources-loader
-- returns absolute paths). The prefix is normalized to "<DIR>/" first:
--   * a DIR given with a trailing slash no longer costs the first character
--     of the relative part (the old fixed "+2" offset assumed no trailing slash)
--   * a sibling such as "<DIR>-old/..." is no longer treated as inside DIR
do
    local dir_root = (DIR:gsub("/+$", "")) -- parentheses drop gsub's count
    local dir_prefix = dir_root .. "/"
    if fediverse_backup_path == dir_root or fediverse_backup_path == dir_prefix then
        -- The source directory is DIR itself
        fediverse_backup_path = ""
    elseif fediverse_backup_path:sub(1, #dir_prefix) == dir_prefix then
        fediverse_backup_path = fediverse_backup_path:sub(#dir_prefix + 1)
    end
end


-- Privacy configuration from unified config
-- CLI flags --include-boosts/--no-boosts override the config value
local function get_include_boosts()
    -- nil means "no flag on the command line"; an explicit false from
    -- --no-boosts must still win over the config, hence the == nil test.
    if CLI_INCLUDE_BOOSTS == nil then
        return config.privacy.include_boosts or false
    end
    return CLI_INCLUDE_BOOSTS
end


-- {{{ local function load_boost_content_cache
-- Load scraped boost content cache from assets/boost-content-cache.json
-- Returns a table mapping URI -> cached content data
--
-- The file is read at most once per run; the result (possibly empty) is kept
-- in boost_content_cache and returned on every later call.
--   * missing file                      -> empty table, silently (normal case)
--   * undecodable JSON / no "entries"   -> empty table plus a warning, so a
--     damaged cache no longer silently degrades boosts to placeholders
local boost_content_cache = nil
local function load_boost_content_cache()
    if boost_content_cache then
        return boost_content_cache
    end

    -- Start from an empty cache; it is only replaced after a successful load.
    boost_content_cache = {}

    local cache_path = DIR .. "/assets/boost-content-cache.json"
    local file = io.open(cache_path, "r")
    if not file then
        return boost_content_cache
    end

    local content = file:read("*a")
    file:close()

    -- dkjson.decode returns (value, next_position, error_message)
    local data, _, err = dkjson.decode(content)
    if err or type(data) ~= "table" or type(data.entries) ~= "table" then
        print(COLOR_YELLOW .. " ⚠️ Ignoring unreadable boost content cache: "
            .. tostring(err or "no \"entries\" table") .. COLOR_RESET)
        return boost_content_cache
    end

    boost_content_cache = data.entries

    -- entries is keyed by URI, so it has to be counted with pairs (# would be 0)
    local count = 0
    for _ in pairs(boost_content_cache) do
        count = count + 1
    end
    print(" 📥 Loaded boost content cache: " .. count .. " entries")
    return boost_content_cache
end
-- }}}


-- Effective privacy settings: values from config.privacy with built-in defaults.
local privacy_config = {
    mode = config.privacy.mode or "clean",
    anonymization_prefix = config.privacy.anonymization_prefix or "user-",
    include_boosts = get_include_boosts(),
    -- Defaults to true. `value or true` can never yield false, so an explicit
    -- `preserve_original_length = false` in the config used to be ignored;
    -- `~= false` keeps nil -> true while honoring a configured false.
    preserve_original_length = config.privacy.preserve_original_length ~= false,
    store_anonymization_map = config.privacy.store_anonymization_map or false,
    local_server_domain = config.privacy.local_server_domain or "tech.lgbt",
    debug_anonymization = false -- Debug flag, not in config
}


-- Announce boost inclusion when it is switched on
if privacy_config.include_boosts then
    print("📤 Including fediverse boosts in extraction (CLI flag or config)")
end

-- Source directory: the CLI override (used for ZIP extraction) wins over the
-- configured fediverse path
local source_base_path = OVERRIDE_SOURCE or (DIR .. "/" .. fediverse_backup_path)
print((OVERRIDE_SOURCE and "🔄 Using temporary extraction source: " or "🔄 Using configured source: ")
    .. relative_path(source_base_path))

-- outbox.json sits directly in the source directory when that path already
-- ends in "extract", otherwise in its "extract" subdirectory
local outbox_suffix = source_base_path:match("extract$") and "/outbox.json" or "/extract/outbox.json"
local file = source_base_path .. outbox_suffix

-- Output always goes below the configured path, even when the source is overridden
local save_location = DIR .. "/" .. fediverse_backup_path .. "/files"


-- Load and parse ActivityPub data
print("🔄 Loading ActivityPub data from: " .. relative_path(file))
local opened_file = io.open(file, "r")
if not opened_file then
    -- Issue 7-006: Full-line coloring for error messages
    print(COLOR_RED .. "❌ Error: Could not open file " .. file .. COLOR_RESET)
    print(" Make sure the file exists and is readable")
    os.exit(1)
end

local opened_file_string = opened_file:read("*a")
opened_file:close()

-- Decode and validate before anything below touches data.orderedItems, so a
-- malformed export ends with a readable message instead of a raw Lua error.
local data
do
    -- dkjson.decode returns (value, next_position, error_message)
    local decoded, _, decode_error = dkjson.decode(opened_file_string)
    if type(decoded) ~= "table" then
        -- Issue 7-006: Full-line coloring for error messages
        print(COLOR_RED .. "❌ Error: Could not parse JSON data from " .. file .. COLOR_RESET)
        if decode_error then
            print(" " .. tostring(decode_error))
        end
        os.exit(1)
    end
    if type(decoded.orderedItems) ~= "table" then
        print(COLOR_RED .. "❌ Error: No orderedItems array found in " .. file .. COLOR_RESET)
        os.exit(1)
    end
    data = decoded
end

-- Issue 7-006: Full-line coloring for success messages
print(COLOR_GREEN .. "✅ Loaded ActivityPub data: " .. (data.totalItems or #data.orderedItems) .. " activities" .. COLOR_RESET)


-- Issue 6-031: poem exclusion filter
-- Excluded poems keep their slot in the ID sequence (tombstoning), so anchor
-- links to the remaining poems stay stable.
local poem_exclusions = exclusion_filter.load_default(DIR)
if not (poem_exclusions:count() <= 0) then
    -- Issue 7-006: Full-line coloring for info messages
    print(COLOR_YELLOW .. "🚫 Exclusion filter loaded: " .. poem_exclusions:summary() .. COLOR_RESET)
end

-- Privacy system state: username -> pseudonym map and the next free pseudonym number
local user_anonymization_map, user_counter = {}, 1


-- {{{ function normalize_username
-- Bring a mention's username into the canonical form used as the
-- anonymization key.
--   1. Numeric path segments such as "/111978500472309702" are removed.
--   2. Known short aliases are replaced by their long canonical form
--      (based on patterns observed in the fediverse data).
local function normalize_username(username)
    -- gsub returns (string, count); the parentheses keep only the string
    local stripped = (username:gsub("/[0-9]+", ""))

    local canonical_names = {
        wyatt = "wyatt8740", -- @wyatt and @wyatt8740 are treated as one user
        -- Add other mappings here as needed
    }

    return canonical_names[stripped] or stripped
end
-- }}}


-- {{{ function anonymize_mention
-- Return the pseudonym for a mentioned user, allocating a new one
-- (anonymization_prefix .. running number) on first sight.
--
-- IMPORTANT: the map key is the normalized username only. The server is used
-- for debug output and nothing else, so the same username on different
-- servers (migrations, cross-server mentions) shares one pseudonym.
local function anonymize_mention(username, server)
    local map_key = normalize_username(username)
    local debug_enabled = privacy_config.debug_anonymization

    if debug_enabled then
        io.stderr:write(string.format("DEBUG: anonymize_mention: '%s' -> '%s' @ '%s'\n",
            username, map_key, server or "local"))
    end

    local pseudonym = user_anonymization_map[map_key]
    if not pseudonym then
        pseudonym = privacy_config.anonymization_prefix .. user_counter
        user_anonymization_map[map_key] = pseudonym
        user_counter = user_counter + 1
        if debug_enabled then
            io.stderr:write(string.format(" -> New mapping: %s = %s\n", map_key, pseudonym))
        end
    end

    return pseudonym
end
-- }}}


-- {{{ function process_mentions_for_privacy
-- Replace every @mention in `content` with an anonymized pseudonym.
--
-- Parameters:
--   content      (string) post content, HTML or plain text
--   privacy_mode (string) only "clean" triggers anonymization
-- Returns:
--   processed_content, original_content
--   (for any other mode both values are the unmodified input)
--
-- How it works: each recognized mention is first swapped for a placeholder
-- ("\1<index>\2") that contains no "@", and the placeholders are turned into
-- "@<pseudonym>" in one final step. Later passes therefore can never re-match
-- the output of earlier ones. The previous multi-pass version did exactly that:
--   * "@alice" became "@user-1", which a later pass anonymized again, so the
--     same user got different pseudonyms depending on position and the user
--     counter was inflated
--   * content starting with two mentions looped forever, because the rewritten
--     first mention still matched the loop condition
-- Further corrections:
--   * The HTML pass matches the mention anchor only. The leading part of the
--     earlier pattern was garbled (presumably a wrapper <span ...> - TODO
--     confirm) and could swallow text in front of the link; wrapper tags are
--     left in place here for later HTML stripping.
--   * The anchor text may contain nested tags (e.g. "@<span>user</span>").
--   * Trailing "." / "-" after a plain-text handle is treated as sentence
--     punctuation and kept outside the pseudonym.
--   * A handle directly followed by another "@" is now anonymized as well.
local function process_mentions_for_privacy(content, privacy_mode)
    if privacy_mode ~= "clean" then
        return content, content -- Return original for dirty mode
    end

    local original_content = content
    local processed_content = content

    -- Pseudonyms in order of discovery; placeholder index N refers to pseudonyms[N]
    local pseudonyms = {}
    local function park(user, server)
        pseudonyms[#pseudonyms + 1] = anonymize_mention(user, server)
        return "\1" .. #pseudonyms .. "\2"
    end

    -- Shared handler for both link styles: the username comes from the URL,
    -- which is more reliable than the (possibly shortened) display text.
    local function park_link(server, user)
        if user == "" then
            return nil -- not a mention link; gsub keeps the original text
        end
        return park(user, server)
    end

    -- HTML mention links: <a href="https://server/@user" ...>@user</a>
    processed_content = processed_content:gsub(
        '<a href="[^"]*://([^/"]+)/@([^"/?]*)[^"]*"[^>]*>@.-</a>', park_link)

    -- HTML mention links: <a href="https://server/users/user" class="u-url mention">@user</a>
    processed_content = processed_content:gsub(
        '<a href="[^"]*://([^/"]+)/users/([^"/?]*)[^"]*"[^>]*>@.-</a>', park_link)

    -- 6-027a: full plain-text mentions @user@domain.com
    processed_content = processed_content:gsub("@([%w%.%-_]+)@([%w%.%-]+%.%w+)", park)

    -- 6-027a: every remaining @user (start, middle or end of the text)
    processed_content = processed_content:gsub("@([%w%.%-_]+)", function(user)
        local handle, trailing = user:match("^(.-)([%.%-]*)$")
        if handle == "" then
            return nil -- only punctuation after "@": leave the text alone
        end
        return park(handle, nil) .. trailing
    end)

    -- Turn the placeholders into the final "@pseudonym" text
    processed_content = processed_content:gsub("\1(%d+)\2", function(index)
        local pseudonym = pseudonyms[tonumber(index)]
        if pseudonym then
            return "@" .. pseudonym
        end
        return nil -- not one of our placeholders
    end)

    return processed_content, original_content
end
-- }}}


-- {{{ function categorize_activity
-- Classify an outbox activity.
-- Returns (category, object):
--   "boost",         activity.object   for Announce activities
--   "original_post", activity.object   for Create activities wrapping a Note
--   "unknown",       nil               for everything else
local function categorize_activity(activity)
    local kind = activity.type

    if kind == "Announce" then
        return "boost", activity.object
    end

    if kind == "Create" and activity.object and activity.object.type == "Note" then
        return "original_post", activity.object
    end

    return "unknown", nil
end
-- }}}


-- {{{ function extract_boost_content
-- Build the boost record for an Announce activity, or return nil when the
-- boosted object offers nothing usable.
--
-- activity.object is a URI string:
--   * cache entry with non-empty content -> "cached_external_boost"
--     (the cached HTML is cleaned later by clean_html)
--   * otherwise                           -> "external_boost" placeholder
-- activity.object is an embedded table:
--   * non-empty content                   -> "embedded_boost"
--   * Issue 10-037: empty content but id  -> "external_boost" placeholder
--     built from the id
-- Issue 10-037: "" is truthy in Lua, so content is compared against "" explicitly.
local function extract_boost_content(announce_activity)
    local target = announce_activity.object
    local boosted_at = announce_activity.published
    local target_kind = type(target)

    if target_kind == "string" then
        local entry = load_boost_content_cache()[target]

        if entry and entry.content and entry.content ~= "" then
            -- Scraped content is available: use it instead of the placeholder
            return {
                type = "cached_external_boost",
                uri = target,
                boost_timestamp = boosted_at,
                content = entry.content,
                content_warning = entry.summary, -- CW from original post
                sensitive = entry.sensitive,
                original_published = entry.published,
                original_author = entry.attributed_to,
                metadata = {
                    is_boost = true,
                    boost_type = "cached_external",
                    original_uri = target,
                    boost_date = boosted_at,
                    scraped_at = entry.scraped_at,
                    original_author = entry.attributed_to
                }
            }
        end

        -- Nothing usable cached: placeholder pointing at the URI
        return {
            type = "external_boost",
            uri = target,
            boost_timestamp = boosted_at,
            content = "External post: " .. target,
            metadata = {
                is_boost = true,
                boost_type = "external",
                original_uri = target,
                boost_date = boosted_at
            }
        }
    end

    if target_kind ~= "table" then
        return nil
    end

    if target.content and target.content ~= "" then
        -- Embedded object carrying its own content
        return {
            type = "embedded_boost",
            content = target.content,
            original_author = target.attributedTo,
            boost_timestamp = boosted_at,
            original_timestamp = target.published,
            metadata = {
                is_boost = true,
                boost_type = "embedded",
                original_author = target.attributedTo,
                boost_date = boosted_at,
                original_date = target.published
            }
        }
    end

    if target.id then
        -- Embedded object without content: placeholder from its id
        return {
            type = "external_boost",
            uri = target.id,
            boost_timestamp = boosted_at,
            content = "External post: " .. target.id,
            metadata = {
                is_boost = true,
                boost_type = "embedded_empty",
                original_uri = target.id,
                original_author = target.attributedTo,
                boost_date = boosted_at,
                content_unavailable = true
            }
        }
    end

    return nil
end
-- }}}


-- {{{ local function clean_html
-- Convert post HTML into the plain text Mastodon counts.
--
-- Parameter: content (string) HTML fragment
-- Returns:   plain text without leading/trailing newlines
--
-- Order of operations (each step depends on the previous one):
--   1. paragraph boundaries become blank lines, <br> variants become newlines
--   2. all remaining tags are removed
--   3. character entities are decoded, "&amp;" last
-- Tags are removed BEFORE entities are decoded so text the author typed with
-- angle brackets ("1 &lt; 2 and 3 &gt; 2") is not mistaken for markup and
-- deleted. "&amp;" is decoded last so "&amp;lt;" yields the literal text
-- "&lt;" rather than being decoded twice into "<".
--
-- NOTE(review): the entity and paragraph patterns were reconstructed (the
-- original literals were lost to HTML stripping) - confirm against the
-- project's copy of this file.
local function clean_html(content)
    -- Every gsub result is assigned to a single variable, which drops the
    -- substitution count that gsub returns as second value.
    local clean = content:gsub("</p><p>", "\n\n")
    -- Issue 6-032: Handle all BR tag variants (<br>, <br/>, <br />)
    -- Mastodon uses the XHTML-style form, which made words run together
    clean = clean:gsub("<br%s*/?>", "\n")
    clean = clean:gsub("<[^>]+>", "")

    clean = clean:gsub("&#39;", "'")
    clean = clean:gsub("&apos;", "'")
    clean = clean:gsub("&quot;", "\"")
    clean = clean:gsub("&lt;", "<")
    clean = clean:gsub("&gt;", ">")
    clean = clean:gsub("&amp;", "&")

    -- Unescape backslash-escaped double quotes
    clean = clean:gsub("\\\"", "\"")
    -- Emoticon repairs, kept exactly as found.
    -- NOTE(review): the first pattern may have lost an entity in the same
    -- stripping that damaged the patterns above - confirm.
    clean = clean:gsub(" _^", "^_^")
    clean = clean:gsub("^^_^", "^_^")

    clean = clean:gsub("^\n+", ""):gsub("\n+$", "") -- Trim newlines
    return clean
end
-- }}}


-- {{{ function process_fediverse_content
-- Produce the display and bookkeeping variants of one post's content.
-- Returns nil when raw_content is missing; otherwise a table with
--   content             HTML-cleaned text after mention processing
--   raw_content         the input, untouched
--   original_content    content as returned by the mention step (pre-anonymization)
--   golden_poem_content HTML-cleaned, pre-anonymization text (for the 1024 char count)
--   content_warning     cw, or nil when cw is empty/absent
--   privacy_applied     true when privacy_mode is "clean"
local function process_fediverse_content(raw_content, cw, privacy_mode)
    if not raw_content then return nil end

    -- Mentions are handled BEFORE HTML cleaning so the markup is still intact
    local anonymized, untouched = process_mentions_for_privacy(raw_content, privacy_mode)
    local has_cw = cw and cw ~= ""

    return {
        content = clean_html(anonymized),
        raw_content = raw_content,
        original_content = untouched,
        golden_poem_content = clean_html(untouched),
        content_warning = has_cw and cw or nil,
        privacy_applied = (privacy_mode == "clean")
    }
end
-- }}}


-- {{{ function extract_date
-- Return the first YYYY-MM-DD found in timestamp, or "0000-00-00" when the
-- timestamp is missing or contains no such date.
local function extract_date(timestamp)
    local fallback = "0000-00-00"
    if not timestamp then
        return fallback
    end
    return timestamp:match("(%d%d%d%d%-%d%d%-%d%d)") or fallback
end
-- }}}


-- {{{ function extract_full_date
-- Reduce a timestamp to "YYYY-MM-DDTHH:MM:SS".
--   * timestamp without that shape -> returned unchanged
--   * no timestamp at all          -> the current time from os.date
local function extract_full_date(timestamp)
    if not timestamp then
        return os.date("%Y-%m-%dT%H:%M:%S")
    end
    return timestamp:match("(%d%d%d%d%-%d%d%-%d%dT%d%d:%d%d:%d%d)") or timestamp
end
-- }}}


-- {{{ function generate_poem_metadata
-- Build the metadata table for one post.
--
-- Parameters:
--   content             display text (after privacy processing)
--   cw                  content warning text; "" or nil when there is none
--   source_data         activity table; its `published` field, when present,
--                       becomes metadata.creation_date
--   golden_poem_content HTML-cleaned text before anonymization; falls back
--                       to `content` when nil
-- Returns: the metadata table.
--
-- Golden poem length = length of the golden content + length of the CW text
-- (without any "CW: " prefix, as per 6-027); a post is golden at exactly 1024.
--
-- Fixed here:
--   * extraction_timestamp carried a "Z" (UTC) suffix but held local time;
--     the "!" prefix makes os.date format UTC
--   * has_content_warning is always a boolean (it was nil for a nil cw)
--
-- NOTE(review): string.len counts bytes, so multi-byte characters count more
-- than once - confirm whether the 1024 limit is meant in bytes or characters.
local function generate_poem_metadata(content, cw, source_data, golden_poem_content)
    local has_cw = (cw and cw ~= "") and true or false

    local golden_content = golden_poem_content or content
    local golden_poem_length = string.len(golden_content)
    if has_cw then
        golden_poem_length = golden_poem_length + string.len(cw)
    end

    local metadata = {
        character_count = string.len(content), -- Display content length (post-privacy)
        golden_poem_character_count = golden_poem_length, -- For golden poem qualification (1024 chars)
        is_golden_poem = (golden_poem_length == 1024),
        -- gsub's second return value is the number of matches, i.e. the word count
        word_count = select(2, content:gsub("%S+", "")),
        has_content_warning = has_cw,
        extraction_timestamp = os.date("!%Y-%m-%dT%H:%M:%SZ")
    }

    if source_data and source_data.published then
        metadata.creation_date = extract_full_date(source_data.published)
    end

    return metadata
end
-- }}}


-- {{{ function extract_attachments
-- Collect media attachment metadata from an ActivityPub Note object.
-- Returns an array with one entry per "Document" attachment that has a url,
-- or nil when there is no such attachment.
--
-- Each entry carries media_type, url, relative_path, alt_text, width, height
-- and blurhash. relative_path is the part of the URL from "files/" onwards:
--   https://server.com/media/files/123/456/789/original/filename.ext
--   -> files/123/456/789/original/filename.ext
-- and stays nil for URLs without a "/files/" segment.
local function extract_attachments(content_object)
    local raw_attachments = content_object.attachment
    if not raw_attachments then
        return nil
    end

    local collected = {}
    for _, item in ipairs(raw_attachments) do
        -- Only Document attachments (images, videos, etc.) are of interest
        if item.type == "Document" and item.url then
            local tail = item.url:match("/files/(.+)$")
            collected[#collected + 1] = {
                media_type = item.mediaType,
                url = item.url,
                relative_path = tail and ("files/" .. tail) or nil,
                alt_text = item.name, -- May be nil if no alt text provided
                width = item.width,
                height = item.height,
                blurhash = item.blurhash
            }
        end
    end

    if #collected == 0 then
        return nil
    end
    return collected
end
-- }}}


local poems_json = {}
local boost_count = 0
local original_count = 0
local attachment_count = 0
local excluded_count = 0 -- Issue 6-031: Track excluded poems
-- Issue 10-038: Separate ID numbering for fediverse_boost category
-- Boosts get their own sequential IDs starting from 0001, independent of fediverse posts
local boost_id_counter = 1

print("🔄 Processing activities with privacy mode: " .. privacy_config.mode)
print("🔄 Include boosts: " .. tostring(privacy_config.include_boosts))

-- Visit the activities in ascending index order. pairs() gives no ordering
-- guarantee, but both the order of the output and the boost ID sequence depend
-- on it, so the numeric keys are collected and sorted first (this also copes
-- with holes in the array).
local activity_keys = {}
for key in pairs(data.orderedItems) do
    if type(key) == "number" then
        activity_keys[#activity_keys + 1] = key
    end
end
table.sort(activity_keys)

for _, key in ipairs(activity_keys) do
    local activity = data.orderedItems[key]
    local activity_type, content_object = categorize_activity(activity)

    -- Issue 6-031: the poem ID comes from the position in the outbox, before
    -- the exclusion filter runs, so excluded posts leave a gap (tombstone)
    -- and the remaining anchors stay stable
    local poem_id = string.format("%04d", key)

    if activity_type == "original_post" then
        if poem_exclusions:is_excluded("fediverse", poem_id) then
            excluded_count = excluded_count + 1
        else
            -- Process original posts (Create activities)
            local cw = content_object.summary or ""
            local processed_content = process_fediverse_content(content_object.content, cw, privacy_config.mode)
            if processed_content then
                local poem_entry = {
                    id = poem_id,
                    category = "fediverse",
                    source_file = "outbox.json",
                    creation_date = extract_full_date(activity.published),
                    content_warning = processed_content.content_warning,
                    content = processed_content.content,
                    raw_content = processed_content.raw_content,
                    metadata = generate_poem_metadata(processed_content.content, cw, activity, processed_content.golden_poem_content)
                }

                -- Add privacy metadata
                if processed_content.privacy_applied then
                    poem_entry.metadata.privacy_mode = privacy_config.mode
                    poem_entry.metadata.mentions_anonymized = true
                    if privacy_config.preserve_original_length then
                        poem_entry.metadata.original_character_count = string.len(processed_content.original_content)
                    end
                end

                -- Media attachments of the Note (URLs map to the local
                -- media_attachments directory)
                local attachments = extract_attachments(content_object)
                if attachments then
                    poem_entry.attachments = attachments
                    poem_entry.metadata.has_attachments = true
                    poem_entry.metadata.attachment_count = #attachments
                    attachment_count = attachment_count + #attachments
                end

                table.insert(poems_json, poem_entry)
                original_count = original_count + 1
            end
        end

    elseif activity_type == "boost" and privacy_config.include_boosts then
        -- Only boosts that yield content take part in the boost ID sequence
        local boost_content = extract_boost_content(activity)
        if boost_content and boost_content.content then
            -- Issue 10-038: boosts have their own ID sequence
            -- (fediverse_boost/0001, 0002, ...). The ID is consumed whether or
            -- not the boost is excluded. Previously an excluded boost left the
            -- counter untouched, so every later boost received the same
            -- excluded ID and was dropped as well.
            local boost_id = string.format("%04d", boost_id_counter)
            boost_id_counter = boost_id_counter + 1

            -- Issue 6-031: Check exclusion filter for boosts (using boost-specific ID)
            if poem_exclusions:is_excluded("fediverse_boost", boost_id) then
                excluded_count = excluded_count + 1
            else
                -- Apply privacy processing to boost content too
                -- NOTE(review): boost_content.content_warning (set for cached
                -- boosts) is not passed on here - confirm whether that is intended.
                local processed_boost = process_fediverse_content(boost_content.content, "", privacy_config.mode)
                local boost_entry = {
                    id = boost_id,
                    category = "fediverse_boost",
                    source_file = "outbox.json",
                    creation_date = extract_full_date(activity.published),
                    content = processed_boost.content,
                    raw_content = processed_boost.raw_content,
                    metadata = boost_content.metadata
                }

                -- Add privacy metadata for boosts
                if processed_boost.privacy_applied then
                    boost_entry.metadata.privacy_mode = privacy_config.mode
                    boost_entry.metadata.mentions_anonymized = true
                end

                table.insert(poems_json, boost_entry)
                boost_count = boost_count + 1
            end
        end
    end
end


-- {{{ Generate JSON output for HTML generation
-- Create output directory. The path is single-quoted for the shell (embedded
-- single quotes are escaped), so spaces or shell metacharacters in the
-- configured path cannot break or alter the command.
os.execute("mkdir -p '" .. (save_location:gsub("'", "'\\''")) .. "'")

-- Count posts with attachments for statistics
local posts_with_attachments = 0
for _, poem in ipairs(poems_json) do
    if poem.attachments then
        posts_with_attachments = posts_with_attachments + 1
    end
end

-- Generate JSON output
local json_output = {
    poems = poems_json,
    extraction_summary = {
        total_poems = #poems_json,
        original_posts = original_count,
        boosted_posts = boost_count,
        poems_excluded = excluded_count, -- Issue 6-031: Excluded poem count
        by_category = {
            fediverse = original_count,
            fediverse_boost = boost_count
        },
        content_warnings = {},
        -- "!" formats UTC, matching the trailing "Z" (plain os.date is local time)
        extraction_date = os.date("!%Y-%m-%dT%H:%M:%SZ"),
        privacy_settings = {
            mode = privacy_config.mode,
            include_boosts = privacy_config.include_boosts,
            mentions_anonymized = (privacy_config.mode == "clean"),
            anonymization_prefix = privacy_config.anonymization_prefix
        },
        attachment_statistics = {
            total_attachments = attachment_count,
            posts_with_attachments = posts_with_attachments
        }
    }
}

-- Collect unique content warnings
local cw_set = {}
for _, poem in ipairs(poems_json) do
    if poem.content_warning then
        cw_set[poem.content_warning] = true
    end
end
for cw, _ in pairs(cw_set) do
    table.insert(json_output.extraction_summary.content_warnings, cw)
end
-- pairs() order is unspecified; sorting keeps poems.json identical between
-- runs over the same input
table.sort(json_output.extraction_summary.content_warnings)

local json_file = save_location .. "/poems.json"
local f, open_error = io.open(json_file, "w")
if not f then
    -- Issue 7-006: Full-line coloring for error messages
    print(COLOR_RED .. "❌ Error: Could not write " .. json_file .. " (" .. tostring(open_error) .. ")" .. COLOR_RESET)
    os.exit(1)
end
f:write(dkjson.encode(json_output, { indent = true }))
f:close()

-- Issue 7-006: Full-line coloring for success messages
print(COLOR_GREEN .. "✅ Fediverse extraction complete" .. COLOR_RESET)
print(" 📄 Generated: " .. relative_path(json_file))
print(" 📊 Total posts processed: " .. #poems_json)
print(" 📝 Original posts: " .. original_count)
print(" 🔄 Boosted posts: " .. boost_count)
if excluded_count > 0 then
    print(" 🚫 Excluded posts: " .. excluded_count .. " (tombstoned)")
end
print(" 🖼️ Attachments found: " .. attachment_count .. " in " .. posts_with_attachments .. " posts")
print(" 🚨 Content warnings: " .. #json_output.extraction_summary.content_warnings)
print(" 🔒 Privacy mode: " .. privacy_config.mode)
if privacy_config.mode == "clean" then
    -- user_counter holds the NEXT free pseudonym number, hence the - 1
    print(" 🎭 Mentions anonymized: " .. (user_counter - 1) .. " users")
end
-- }}}
