scripts/scrape-boost-content.lua

445 lines

1#!/usr/bin/env lua
2
3-- Boost Content Scraper
4-- ====================
5-- This script fetches the actual content of external fediverse boost URIs.
6-- It respects rate limits (1 second between requests, plus an extra 2 seconds when consecutive requests hit the same domain)
7-- and caches results to avoid re-scraping. Output is saved to assets/boost-content-cache.json
8-- for use by extract-fediverse.lua during poem extraction.
9--
10-- Usage:
11-- lua scripts/scrape-boost-content.lua [DIR] [--force] [--dry-run] [--max=N]
12--
13-- The --force flag will re-scrape URIs even if they are already cached.
14
15-- {{{ local function setup_dir_path
--- Resolve the project directory from the first command-line argument.
-- A missing argument, or one that starts with "--" (i.e. a flag), selects
-- the built-in default project path instead.
-- @param provided_dir string|nil first CLI argument, if any
-- @return string directory to use as the project root
local function setup_dir_path(provided_dir)
  local default_dir = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"

  if not provided_dir then
    return default_dir
  end
  -- Dashes are escaped because '-' is a magic character in Lua patterns.
  if provided_dir:match("^%-%-") then
    return default_dir
  end
  return provided_dir
end
23-- }}}
24
-- Script configuration
-- DIR comes from the first command-line argument (see setup_dir_path); the
-- remaining settings are switched on by the flags parsed below.
local DIR = setup_dir_path(arg and arg[1])
local FORCE_RESCRAPE = false
local DRY_RUN = false
local MAX_SCRAPE = nil -- Limit number of URIs to scrape (for testing)

-- Parse arguments
-- `arg` may be nil when this chunk is not started by the standalone
-- interpreter, so fall back to an empty list instead of failing on `#arg`.
for _, flag in ipairs(arg or {}) do
  if flag == "--force" then
    FORCE_RESCRAPE = true
  elseif flag == "--dry-run" then
    DRY_RUN = true
  else
    -- '-' is the lazy-repetition operator in Lua patterns, so an unescaped
    -- "^--max=" would also accept "max=5" and "-max=5". Escape both dashes
    -- so that only a literal "--max=N" sets the limit.
    local limit = flag:match("^%-%-max=(%d+)$")
    if limit then
      MAX_SCRAPE = tonumber(limit)
    end
  end
end
41
42-- Load required libraries
43package.path = DIR .. "/libs/?.lua;" .. package.path
44local dkjson = require("dkjson")
45
-- Configuration
-- Every tunable used by the scraper lives in this one table.
local CONFIG = {
  -- File paths (both located under the project directory)
  cache_path = DIR .. "/assets/boost-content-cache.json",
  poems_json_path = DIR .. "/input/fediverse/files/poems.json",

  -- HTTP settings
  accept_header = "application/activity+json",
  user_agent = "NeocitiesModernization/1.0 (Fediverse content aggregator for poetry project)",
  request_timeout = 10, -- handed to curl --max-time, in seconds

  -- Rate limiting (seconds)
  delay_between_requests = 1.0, -- pause before every request after the first
  delay_same_domain = 2.0, -- extra pause when the previous request hit the same domain

  -- Retry settings
  max_retries = 2, -- upper bound of the per-URI fetch attempt loop
  retry_delay = 3.0, -- seconds to wait between attempts
}
65
66-- {{{ local function read_json_file
--- Read a whole file and decode it as JSON.
-- @param path string file to read
-- @return the decoded value on success
-- @return nil, string when the file cannot be opened or dkjson reports an error
local function read_json_file(path)
  local handle = io.open(path, "r")
  if not handle then
    return nil, "Could not open file: " .. path
  end

  local raw = handle:read("*a")
  handle:close()

  -- Only the first and third results of dkjson.decode are used here: the
  -- decoded value and the error message.
  local decoded, _, parse_err = dkjson.decode(raw)
  if parse_err then
    return nil, "JSON parse error: " .. tostring(parse_err)
  end
  return decoded
end
81-- }}}
82
83-- {{{ local function write_json_file
--- Encode a value as indented JSON and write it to a file.
-- Unlike a bare write, failures reported by file:write or file:close
-- (e.g. a full disk) are returned to the caller rather than being ignored.
-- @param path string destination file (truncated if it exists)
-- @param data value to encode with dkjson
-- @return true on success
-- @return false, string describing the failure otherwise
local function write_json_file(path, data)
  local json_str = dkjson.encode(data, { indent = true })

  local file = io.open(path, "w")
  if not file then
    return false, "Could not write file: " .. path
  end

  -- file:write returns a truthy value on success and nil, message on error.
  local written, write_err = file:write(json_str)
  -- Always close the handle, even after a failed write.
  local closed, close_err = file:close()

  if not written then
    return false, "Could not write file: " .. path .. " (" .. tostring(write_err) .. ")"
  end
  if not closed then
    return false, "Could not write file: " .. path .. " (" .. tostring(close_err) .. ")"
  end
  return true
end
94-- }}}
95
96-- {{{ local function extract_domain
--- Pull the authority part out of an http(s) URI.
-- Example: "https://mastodon.social/users/foo/statuses/123" -> "mastodon.social"
-- @param uri string
-- @return string|nil the run of non-"/" characters following "http://" or
--   "https://", or nil when the URI contains no such prefix
local function extract_domain(uri)
  local authority_pattern = "https?://([^/]+)"
  -- Parentheses keep the result to exactly one value.
  return (uri:match(authority_pattern))
end
102-- }}}
103
104-- {{{ local function sleep
--- Block for the given number of seconds by running the external `sleep`
-- command (Lua has no built-in sleep).
-- The duration is truncated to whole milliseconds before it is appended to
-- the command line.
-- @param seconds number how long to pause
local function sleep(seconds)
  local whole_ms = math.floor(seconds * 1000)
  local truncated_seconds = whole_ms / 1000
  os.execute("sleep " .. truncated_seconds)
end
111-- }}}
112
113-- {{{ local function fetch_activitypub_json
--- Fetch the ActivityPub JSON representation of a URI by shelling out to curl.
-- @param uri string URI to request
-- @return table with fields:
--   success     boolean
--   data        decoded JSON table (only when success is true)
--   error       string (only when success is false)
--   status_code number parsed from curl's trailing status line, or 0 when
--               that line is missing or not numeric
local function fetch_activitypub_json(uri)
  -- Single-quote the URI for the shell; an embedded single quote is written
  -- as '\'' (close quote, escaped quote, reopen quote).
  local escaped_uri = uri:gsub("'", "'\\''")
  local cmd = string.format(
    "curl -s -w '\\n%%{http_code}' --max-time %d " ..
    "-H 'Accept: %s' " ..
    "-H 'User-Agent: %s' " ..
    "'%s' 2>&1",
    CONFIG.request_timeout,
    CONFIG.accept_header,
    CONFIG.user_agent,
    escaped_uri
  )

  -- io.popen can fail (or be unsupported), in which case it returns nil.
  local handle, popen_err = io.popen(cmd)
  if not handle then
    return {
      success = false,
      error = "Could not run curl: " .. tostring(popen_err),
      status_code = 0
    }
  end
  local output = handle:read("*a") or ""
  handle:close()

  -- Nothing but newlines (or nothing at all) came back.
  if not output:find("[^\n]") then
    return { success = false, error = "Empty response", status_code = 0 }
  end

  -- The -w format appends "\n<status>" after the body, so the last line is
  -- the status code and everything before it is the body, left untouched.
  local trimmed = output:gsub("\n+$", "")
  local body, status_line = trimmed:match("^(.*)\n([^\n]*)$")
  if not body then
    -- Only one line of output: treat it as the status line with no body.
    body, status_line = "", trimmed
  end
  local status_code = tonumber(status_line) or 0

  -- Check status code
  if status_code ~= 200 then
    return {
      success = false,
      error = "HTTP " .. status_code,
      status_code = status_code
    }
  end

  -- Parse JSON
  local data, _, err = dkjson.decode(body)
  if err then
    return {
      success = false,
      error = "JSON parse error: " .. tostring(err),
      status_code = status_code
    }
  end

  -- Callers index the result by field name, so anything that did not decode
  -- to a table (null, a bare string or number) is reported as a failure.
  if type(data) ~= "table" then
    return {
      success = false,
      error = "Response is not a JSON object",
      status_code = status_code
    }
  end

  return { success = true, data = data, status_code = status_code }
end
174-- }}}
175
176-- {{{ local function extract_content_from_activitypub
--- Pick the fields of interest out of a decoded ActivityPub object.
-- The HTML body is taken from `content`; when that is absent, the first
-- entry yielded by traversing `contentMap` is used instead (traversal order
-- is unspecified, so with several languages any one of them may be picked).
-- @param ap_data table|nil decoded ActivityPub object
-- @return table with content, summary, sensitive, published, attributed_to
--   and type fields
-- @return nil, string when no data or no content is available
local function extract_content_from_activitypub(ap_data)
  if not ap_data then
    return nil, "No data provided"
  end

  local html = ap_data.content
  if not html and ap_data.contentMap then
    for _, localized in pairs(ap_data.contentMap) do
      html = localized
      break
    end
  end

  if not html then
    return nil, "No content field found"
  end

  local extracted = {}
  extracted.content = html                        -- HTML content
  extracted.summary = ap_data.summary             -- content warning, may be nil
  extracted.sensitive = ap_data.sensitive or false
  extracted.published = ap_data.published
  extracted.attributed_to = ap_data.attributedTo  -- author URI
  extracted.type = ap_data.type
  return extracted
end
211-- }}}
212
213-- {{{ local function load_cache
--- Load the scrape cache from CONFIG.cache_path.
-- Always returns a table with `metadata`, `entries` and `errors` sub-tables:
-- a missing or unreadable file yields a fresh cache, and a file that lacks
-- any of the three sections has the missing ones filled in so that later
-- code can index them unconditionally.
-- @return table cache
local function load_cache()
  local now = os.date("!%Y-%m-%dT%H:%M:%SZ")
  local cache, err = read_json_file(CONFIG.cache_path)

  if type(cache) ~= "table" then
    -- A cache that exists but cannot be parsed is about to be replaced on
    -- the next save; say so instead of discarding it silently.
    if err and err:find("^JSON parse error") then
      io.stderr:write("WARNING: ignoring unreadable cache file " ..
        CONFIG.cache_path .. " (" .. err .. ")\n")
    end
    cache = {}
  end

  if type(cache.metadata) ~= "table" then
    cache.metadata = {
      created = now,
      last_updated = now,
      version = 1
    }
  end
  if type(cache.entries) ~= "table" then
    cache.entries = {}
  end
  if type(cache.errors) ~= "table" then
    cache.errors = {}
  end
  return cache
end
229-- }}}
230
231-- {{{ local function save_cache
--- Stamp the cache with the current UTC time and write it to CONFIG.cache_path.
-- @param cache table cache to persist; `metadata` is created if missing
-- @return whatever write_json_file returns (true, or false plus a message)
local function save_cache(cache)
  -- A cache loaded from a file without a metadata section would otherwise
  -- fail here with an "attempt to index a nil value" error.
  if type(cache.metadata) ~= "table" then
    cache.metadata = {}
  end
  cache.metadata.last_updated = os.date("!%Y-%m-%dT%H:%M:%SZ")
  return write_json_file(CONFIG.cache_path, cache)
end
236-- }}}
237
238-- {{{ local function get_external_boost_uris
--- Collect the original URIs of all external boosts listed in poems.json.
-- A poem counts when its metadata.boost_type is "external" and its
-- metadata.original_uri is a string.
-- @return table list of URI strings (may contain duplicates)
-- @return nil, string when poems.json cannot be read or parsed
local function get_external_boost_uris()
  local poems_data, read_err = read_json_file(CONFIG.poems_json_path)
  if type(poems_data) ~= "table" then
    -- Keep the underlying reason so the user can tell a missing file from
    -- malformed JSON.
    local detail = read_err and (": " .. read_err) or ""
    return nil, "Could not read poems.json" .. detail
  end

  local poems = poems_data.poems
  if type(poems) ~= "table" then
    poems = {}
  end

  local uris = {}
  for _, poem in ipairs(poems) do
    local meta = type(poem) == "table" and poem.metadata
    if type(meta) == "table" and meta.boost_type == "external" then
      local uri = meta.original_uri
      -- Later stages call string methods on the URI, so skip anything else.
      if type(uri) == "string" then
        uris[#uris + 1] = uri
      end
    end
  end

  return uris
end
258-- }}}
259
260-- {{{ local function scrape_boosts
-- Domain key used for ordering and rate limiting. URIs that extract_domain
-- cannot parse map to "" so that comparisons never see nil.
local function boost_domain_key(uri)
  return extract_domain(uri) or ""
end

-- Split the URI list into the unique URIs that still need scraping, plus the
-- number skipped because they are cached and the number of repeats dropped.
local function select_uris_to_scrape(uris, cache)
  local to_scrape, seen = {}, {}
  local skipped, duplicates = 0, 0
  for _, uri in ipairs(uris) do
    if seen[uri] then
      duplicates = duplicates + 1
    else
      seen[uri] = true
      if FORCE_RESCRAPE or not cache.entries[uri] then
        to_scrape[#to_scrape + 1] = uri
      else
        skipped = skipped + 1
      end
    end
  end
  return to_scrape, skipped, duplicates
end

-- Total rate-limit pause, in seconds, that the scrape loop will take for
-- this ordered list (request time itself is not included).
local function estimate_delay_seconds(ordered_uris)
  local total, previous = 0, nil
  for i, uri in ipairs(ordered_uris) do
    local domain = boost_domain_key(uri)
    if i > 1 then
      total = total + CONFIG.delay_between_requests
      if domain == previous then
        total = total + CONFIG.delay_same_domain
      end
    end
    previous = domain
  end
  return total
end

-- Print the dry-run report: every URI, a per-domain tally, and the estimate.
local function print_dry_run_report(to_scrape)
  print("=== DRY RUN - Would scrape these URIs ===")
  local domains = {}
  for _, uri in ipairs(to_scrape) do
    -- A nil domain cannot be used as a table key, so give it a label.
    local domain = extract_domain(uri) or "(unknown domain)"
    domains[domain] = (domains[domain] or 0) + 1
    print("  " .. uri)
  end
  print("")
  print("=== Domains to contact ===")
  local domain_list = {}
  for domain, count in pairs(domains) do
    domain_list[#domain_list + 1] = { domain = domain, count = count }
  end
  table.sort(domain_list, function(a, b) return a.count > b.count end)
  for _, d in ipairs(domain_list) do
    print(string.format("  %3d URIs from %s", d.count, d.domain))
  end
  print("")
  print("Estimated time: " .. math.ceil(estimate_delay_seconds(to_scrape) / 60) .. " minutes")
end

-- Fetch one URI, making up to CONFIG.max_retries attempts (at least one) and
-- pausing CONFIG.retry_delay seconds between attempts.
local function fetch_with_retries(uri)
  local attempts = math.max(1, CONFIG.max_retries)
  local result
  for attempt = 1, attempts do
    result = fetch_activitypub_json(uri)
    if result.success then
      break
    end
    if attempt < attempts then
      io.write("retry " .. attempt .. " ... ")
      io.flush()
      sleep(CONFIG.retry_delay)
    end
  end
  return result
end

--- Scrape every external boost URI that is not cached yet and update the cache.
-- Reads the URI list from poems.json, honours the FORCE_RESCRAPE, DRY_RUN and
-- MAX_SCRAPE settings, rate-limits requests, records successes under
-- cache.entries and failures under cache.errors, and saves the cache.
-- @return boolean false when poems.json cannot be read or the final cache
--   save fails, true otherwise (individual fetch failures do not count)
local function scrape_boosts()
  print("=== Boost Content Scraper ===")
  print("Project directory: " .. DIR)
  print("Force rescrape: " .. tostring(FORCE_RESCRAPE))
  print("Dry run: " .. tostring(DRY_RUN))
  if MAX_SCRAPE then
    print("Max URIs to scrape: " .. MAX_SCRAPE)
  end
  print("")

  -- Load existing cache; make sure both sections exist before indexing them.
  local cache = load_cache()
  cache.entries = cache.entries or {}
  cache.errors = cache.errors or {}

  -- Count existing entries
  local existing_count = 0
  for _ in pairs(cache.entries) do
    existing_count = existing_count + 1
  end
  print("Existing cache entries: " .. existing_count)

  -- Get list of URIs to scrape
  local uris, err = get_external_boost_uris()
  if not uris then
    print("ERROR: " .. tostring(err))
    return false
  end
  print("Found " .. #uris .. " external boost URIs in poems.json")
  print("")

  -- Filter to unique URIs not yet cached (unless force rescrape). Without
  -- de-duplication a URI boosted several times is fetched several times.
  local to_scrape, skipped, duplicates = select_uris_to_scrape(uris, cache)

  print("URIs already cached: " .. skipped)
  if duplicates > 0 then
    print("Duplicate URIs ignored: " .. duplicates)
  end
  print("URIs to scrape: " .. #to_scrape)

  -- Apply max scrape limit
  if MAX_SCRAPE and #to_scrape > MAX_SCRAPE then
    print("Limiting to first " .. MAX_SCRAPE .. " URIs")
    local limited = {}
    for i = 1, MAX_SCRAPE do
      limited[i] = to_scrape[i]
    end
    to_scrape = limited
  end

  print("")

  if #to_scrape == 0 then
    print("Nothing to scrape - all URIs already cached!")
    return true
  end

  -- Group URIs by domain. The URI itself breaks ties so the order is
  -- deterministic, and the nil-safe key keeps the comparator from raising
  -- on URIs without an http(s) prefix.
  table.sort(to_scrape, function(a, b)
    local domain_a, domain_b = boost_domain_key(a), boost_domain_key(b)
    if domain_a ~= domain_b then
      return domain_a < domain_b
    end
    return a < b
  end)

  -- Dry run: just show what would be scraped
  if DRY_RUN then
    print_dry_run_report(to_scrape)
    return true
  end

  -- Scrape each URI
  local last_domain = nil
  local success_count = 0
  local error_count = 0
  local start_time = os.time()

  for i, uri in ipairs(to_scrape) do
    local domain = boost_domain_key(uri)

    -- Rate limiting
    if i > 1 then
      local delay = CONFIG.delay_between_requests
      if domain == last_domain then
        delay = delay + CONFIG.delay_same_domain
      end
      sleep(delay)
    end

    -- Progress indicator
    io.write(string.format("[%d/%d] %s ... ", i, #to_scrape, uri:sub(1, 60)))
    io.flush()

    local result = fetch_with_retries(uri)
    local now = os.date("!%Y-%m-%dT%H:%M:%SZ")

    if result.success then
      -- Extract content
      local content_data, extract_err = extract_content_from_activitypub(result.data)
      if content_data then
        cache.entries[uri] = {
          scraped_at = now,
          content = content_data.content,
          summary = content_data.summary,
          sensitive = content_data.sensitive,
          published = content_data.published,
          attributed_to = content_data.attributed_to,
          type = content_data.type
        }
        -- Drop any error recorded by an earlier failed attempt.
        cache.errors[uri] = nil
        success_count = success_count + 1
        print("OK")

        -- Save cache periodically (every 10 successful scrapes). Checking
        -- here, rather than on every iteration, avoids re-saving on each
        -- failure that follows a multiple of 10.
        if success_count % 10 == 0 then
          save_cache(cache)
        end
      else
        cache.errors[uri] = {
          attempted_at = now,
          error = extract_err or "Unknown extraction error"
        }
        error_count = error_count + 1
        print("EXTRACT ERROR: " .. (extract_err or "unknown"))
      end
    else
      cache.errors[uri] = {
        attempted_at = now,
        error = result.error,
        status_code = result.status_code
      }
      error_count = error_count + 1
      print("ERROR: " .. tostring(result.error))
    end

    last_domain = domain
  end

  -- Final save
  local saved, save_err = save_cache(cache)

  -- Summary
  local elapsed = os.time() - start_time
  print("")
  print("=== Scraping Complete ===")
  print("Scraped: " .. success_count .. " URIs")
  print("Errors: " .. error_count .. " URIs")
  print("Time: " .. elapsed .. " seconds")
  if not saved then
    print("ERROR: " .. tostring(save_err or "could not save cache"))
    return false
  end
  print("Cache saved to: " .. CONFIG.cache_path)

  return true
end
441-- }}}
442
-- Run scraper
-- scrape_boosts returns false on failure; turn that into a non-zero exit
-- status so shell pipelines and build scripts can detect it.
if not scrape_boosts() then
  os.exit(1)
end
445