scripts/scrape-boost-content.lua

445 lines

#!/usr/bin/env lua

-- Boost Content Scraper
-- =====================
-- This script fetches the actual content of external fediverse boost URIs.
-- It respects rate limits (1 second between any two requests, plus 2 extra
-- seconds when a request goes to the same domain as the previous one)
-- and caches results to avoid re-scraping. Output is saved to
-- assets/boost-content-cache.json for use by extract-fediverse.lua during
-- poem extraction.
--
-- Usage:
--   lua scripts/scrape-boost-content.lua [DIR] [--force] [--dry-run] [--max=N]
--
-- The --force flag will re-scrape URIs even if they are already cached.
-- The --dry-run flag lists the URIs that would be scraped without fetching them.
-- The --max=N flag limits the run to at most N URIs (for testing).

-- {{{ local function setup_dir_path
--- Resolve the project directory for this run.
-- @param provided_dir first command-line argument, or nil when absent
-- @return provided_dir unless it is missing or starts with "--" (a flag);
--         otherwise the built-in default project path
local function setup_dir_path(provided_dir)
  local default_dir = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"

  if not provided_dir then
    return default_dir
  end

  -- Anything starting with "--" is an option, not a directory.
  local looks_like_flag = provided_dir:sub(1, 2) == "--"
  if looks_like_flag then
    return default_dir
  end

  return provided_dir
end
-- }}}

-- Script configuration
local DIR = setup_dir_path(arg and arg[1])
local FORCE_RESCRAPE = false -- --force: re-scrape URIs that are already cached
local DRY_RUN = false -- --dry-run: list what would be scraped, fetch nothing
local MAX_SCRAPE = nil -- --max=N: limit number of URIs to scrape (for testing)

-- Parse arguments.
-- `arg` can be nil when the chunk is not started by the standalone
-- interpreter, so fall back to an empty list (DIR above is guarded the same way).
-- Arguments that match none of the flags (e.g. the directory) are ignored here.
for _, option in ipairs(arg or {}) do
  if option == "--force" then
    FORCE_RESCRAPE = true
  elseif option == "--dry-run" then
    DRY_RUN = true
  else
    -- "-" is a lazy quantifier in Lua patterns and must be escaped as "%-";
    -- the unescaped "^--max=" also accepted "max=5" and "-max=5".
    local limit = option:match("^%-%-max=(%d+)$")
    if limit then
      MAX_SCRAPE = tonumber(limit)
    end
  end
end

42-- Load required libraries

43package.path = DIR .. "/libs/?.lua;" .. package.path

44local dkjson = require("dkjson")

-- Configuration
local CONFIG = {
  -- File paths (both rooted at the project directory)
  cache_path = DIR .. "/assets/boost-content-cache.json",
  poems_json_path = DIR .. "/input/fediverse/files/poems.json",

  -- HTTP settings
  accept_header = "application/activity+json",
  user_agent = "NeocitiesModernization/1.0 (Fediverse content aggregator for poetry project)",
  request_timeout = 10, -- passed to curl as --max-time, in seconds

  -- Rate limiting
  delay_between_requests = 1.0, -- base pause before every request after the first, in seconds
  delay_same_domain = 2.0, -- extra pause added when the previous request hit the same domain

  -- Retry settings
  retry_delay = 3.0, -- pause between attempts, in seconds
  max_retries = 2, -- upper bound of the attempt loop, i.e. total tries per URI
}

-- {{{ local function read_json_file
--- Load a file and decode its contents as JSON.
-- @param path file to read
-- @return the decoded value on success
-- @return nil plus an error message when the file cannot be opened or parsed
local function read_json_file(path)
  local handle = io.open(path, "r")
  if handle == nil then
    return nil, "Could not open file: " .. path
  end

  local raw = handle:read("*a")
  handle:close()

  -- dkjson.decode yields (value, next_position, error_message); only the
  -- first and third results matter here.
  local decoded, _, parse_err = dkjson.decode(raw)
  if not parse_err then
    return decoded
  end
  return nil, "JSON parse error: " .. tostring(parse_err)
end
-- }}}

-- {{{ local function write_json_file
--- Serialize a value as indented JSON and write it to a file.
-- The value is encoded before the file is opened for writing.
-- @param path destination file
-- @param data value to encode
-- @return true on success
-- @return false plus an error message when the value cannot be encoded or
--         the file cannot be opened, written, or closed
local function write_json_file(path, data)
  local json_str = dkjson.encode(data, { indent = true })
  if type(json_str) ~= "string" then
    -- Do not open (and thereby truncate) the file when there is nothing to write.
    return false, "Could not encode JSON for file: " .. path
  end

  local file = io.open(path, "w")
  if not file then
    return false, "Could not write file: " .. path
  end

  -- write() and close() both report failures (e.g. a full disk) through
  -- their return values; ignoring them would report a truncated file as saved.
  local written, write_err = file:write(json_str)
  local closed, close_err = file:close()
  if not written then
    return false, "Could not write file: " .. path .. " (" .. tostring(write_err) .. ")"
  end
  if not closed then
    return false, "Could not write file: " .. path .. " (" .. tostring(close_err) .. ")"
  end
  return true
end
-- }}}

-- {{{ local function extract_domain
--- Extract the host part of an http(s) URI.
-- Example: "https://mastodon.social/users/foo/statuses/123" -> "mastodon.social"
-- @param uri URI string
-- @return the text between "://" and the first "/", "?" or "#",
--         or nil when uri is not a string or contains no http(s) URL
local function extract_domain(uri)
  if type(uri) ~= "string" then
    return nil
  end
  -- Stop at "?" and "#" as well as "/": a URI without a path can be
  -- followed directly by a query or fragment.
  local domain = uri:match("https?://([^/?#]+)")
  return domain
end
-- }}}

-- {{{ local function sleep
--- Pause the script by running the external `sleep` command.
-- @param seconds duration in seconds; rounded down to whole milliseconds
local function sleep(seconds)
  -- Lua has no built-in sleep, so shell out.
  local whole_ms = math.floor(seconds * 1000)
  local duration = whole_ms / 1000
  os.execute("sleep " .. duration)
end
-- }}}

-- {{{ local function fetch_activitypub_json
--- Fetch the ActivityPub JSON representation of a URI by running curl.
-- curl is invoked without -L, so redirects are reported as "HTTP 3xx" errors.
-- @param uri http(s) URI to request
-- @return table {success = bool, data = table or nil, error = string or nil,
--         status_code = int}; status_code is 0 when no HTTP status was obtained
local function fetch_activitypub_json(uri)
  -- Only hand http(s) URLs to curl. The URI is data from poems.json: a value
  -- starting with "-" would be parsed by curl as an option, and other schemes
  -- such as file:// would read local files.
  if type(uri) ~= "string" or not uri:lower():match("^https?://") then
    return {
      success = false,
      error = "Unsupported URI: " .. tostring(uri),
      status_code = 0
    }
  end

  -- The URI is wrapped in single quotes for the shell; an embedded single
  -- quote is written as '\'' (close quote, escaped quote, reopen quote).
  local escaped_uri = uri:gsub("'", "'\\''")
  -- -w appends "\n<http_code>" after the body so the status can be split
  -- off the end of the output below.
  local cmd = string.format(
    "curl -s -w '\\n%%{http_code}' --max-time %d " ..
    "-H 'Accept: %s' " ..
    "-H 'User-Agent: %s' " ..
    "'%s' 2>&1",
    CONFIG.request_timeout,
    CONFIG.accept_header,
    CONFIG.user_agent,
    escaped_uri
  )

  local handle, popen_err = io.popen(cmd)
  if not handle then
    return {
      success = false,
      error = "Could not run curl: " .. tostring(popen_err),
      status_code = 0
    }
  end
  local output = handle:read("*a") or ""
  handle:close()

  -- Nothing but newlines (or nothing at all) came back.
  if not output:find("[^\n]") then
    return { success = false, error = "Empty response", status_code = 0 }
  end

  -- The greedy ".*" makes the split happen at the LAST newline, so the body
  -- is kept verbatim (including any blank lines) and the trailing digits are
  -- the HTTP status. Output without that shape yields status 0.
  local body, code = output:match("^(.*)\n(%d+)%s*$")
  local status_code = tonumber(code) or 0

  if status_code ~= 200 then
    return {
      success = false,
      error = "HTTP " .. status_code,
      status_code = status_code
    }
  end

  local data, _, err = dkjson.decode(body)
  if err then
    return {
      success = false,
      error = "JSON parse error: " .. tostring(err),
      status_code = status_code
    }
  end

  -- Valid JSON that is not an object/array (a bare number, string, ...)
  -- cannot be an ActivityPub document; callers index `data` as a table.
  if type(data) ~= "table" then
    return {
      success = false,
      error = "Response is not a JSON object",
      status_code = status_code
    }
  end

  return { success = true, data = data, status_code = status_code }
end
-- }}}

-- {{{ local function extract_content_from_activitypub
--- Pick the fields of interest out of a decoded ActivityPub object.
-- @param ap_data decoded ActivityPub JSON (a table)
-- @return table {content, summary, sensitive, published, attributed_to, type}
-- @return nil plus an error message when ap_data is missing, is not a table,
--         or carries neither `content` nor a usable `contentMap`
local function extract_content_from_activitypub(ap_data)
  if not ap_data then
    return nil, "No data provided"
  end
  if type(ap_data) ~= "table" then
    -- Indexing a number or boolean would raise instead of returning an error.
    return nil, "Data is not a table"
  end

  -- The content field contains HTML
  local content = ap_data.content
  if not content and type(ap_data.contentMap) == "table" then
    -- Fall back to contentMap (localized content). pairs() order is
    -- unspecified, so choose the entry with the smallest language key to
    -- get the same result on every run.
    local first_lang = nil
    for lang in pairs(ap_data.contentMap) do
      if first_lang == nil or tostring(lang) < tostring(first_lang) then
        first_lang = lang
      end
    end
    if first_lang ~= nil then
      content = ap_data.contentMap[first_lang]
    end
  end

  if not content then
    return nil, "No content field found"
  end

  return {
    content = content, -- HTML content
    summary = ap_data.summary, -- Content warning (may be nil)
    sensitive = ap_data.sensitive or false,
    published = ap_data.published,
    attributed_to = ap_data.attributedTo, -- Author URI (for anonymization reference)
    type = ap_data.type, -- Usually "Note"
  }
end
-- }}}

-- {{{ local function load_cache
--- Load the scrape cache from CONFIG.cache_path, or start a fresh one.
-- @return table that always has `metadata`, `entries` and `errors` tables
local function load_cache()
  local now = os.date("!%Y-%m-%dT%H:%M:%SZ")
  local cache, err = read_json_file(CONFIG.cache_path)

  if type(cache) ~= "table" then
    -- A missing file is the normal first-run case. An unparseable one is
    -- worth a warning, because the next save will overwrite it.
    if type(err) == "string" and err:find("^JSON parse error") then
      io.stderr:write("WARNING: ignoring unreadable cache " ..
        CONFIG.cache_path .. " (" .. err .. ")\n")
    end
    cache = {}
  end

  -- The rest of the script indexes these three tables directly, so make
  -- sure a hand-edited or partial cache file still provides them.
  if type(cache.metadata) ~= "table" then
    cache.metadata = {
      created = now,
      last_updated = now,
      version = 1
    }
  end
  if type(cache.entries) ~= "table" then
    cache.entries = {}
  end
  if type(cache.errors) ~= "table" then
    cache.errors = {}
  end

  return cache
end
-- }}}

-- {{{ local function save_cache
--- Stamp the cache with the current UTC time and write it to CONFIG.cache_path.
-- @param cache cache table; its `metadata.last_updated` field is overwritten
-- @return whatever write_json_file returns: true, or false plus an error message
local function save_cache(cache)
  -- A cache without a metadata table would otherwise raise on the next line.
  if type(cache.metadata) ~= "table" then
    cache.metadata = {}
  end
  cache.metadata.last_updated = os.date("!%Y-%m-%dT%H:%M:%SZ")
  return write_json_file(CONFIG.cache_path, cache)
end
-- }}}

-- {{{ local function get_external_boost_uris
--- Collect the original URIs of all external boosts listed in poems.json.
-- A poem counts when `poem.metadata.boost_type == "external"` and
-- `poem.metadata.original_uri` is a string.
-- @return array of URI strings, in file order (duplicates are kept)
-- @return nil plus an error message when poems.json cannot be read or parsed
local function get_external_boost_uris()
  local poems_data, read_err = read_json_file(CONFIG.poems_json_path)
  if type(poems_data) ~= "table" then
    -- Keep the underlying reason (missing file vs. parse error) in the message.
    local message = "Could not read poems.json"
    if read_err then
      message = message .. ": " .. tostring(read_err)
    end
    return nil, message
  end

  local uris = {}
  local poems = poems_data.poems
  if type(poems) ~= "table" then
    return uris
  end

  for _, poem in ipairs(poems) do
    local metadata = type(poem) == "table" and poem.metadata
    if type(metadata) == "table" and metadata.boost_type == "external" then
      local uri = metadata.original_uri
      -- Later stages call string methods on the URI, so skip anything else.
      if type(uri) == "string" then
        uris[#uris + 1] = uri
      end
    end
  end

  return uris
end
-- }}}

-- {{{ local function scrape_boosts
--- Run the scraper end to end.
-- Steps: load the cache, list external boost URIs from poems.json, drop the
-- ones already cached (unless FORCE_RESCRAPE), apply MAX_SCRAPE, sort by
-- domain, then either print a dry-run report or fetch each URI with rate
-- limiting and retries, recording results in the cache.
-- @return true when the run completed (including dry runs and "nothing to
--         scrape"); false when poems.json could not be read or the final
--         cache save failed
local function scrape_boosts()
  -- UTC timestamp in the format used throughout the cache file.
  local function timestamp()
    return os.date("!%Y-%m-%dT%H:%M:%SZ")
  end

  -- extract_domain returns nil when no http(s) URL is found; map that to ""
  -- so domains can be compared with "<" and used as table keys.
  local function domain_of(uri)
    return extract_domain(uri) or ""
  end

  print("=== Boost Content Scraper ===")
  print("Project directory: " .. DIR)
  print("Force rescrape: " .. tostring(FORCE_RESCRAPE))
  print("Dry run: " .. tostring(DRY_RUN))
  if MAX_SCRAPE then
    print("Max URIs to scrape: " .. MAX_SCRAPE)
  end
  print("")

  -- Load existing cache; both result tables are indexed directly below.
  local cache = load_cache()
  cache.entries = cache.entries or {}
  cache.errors = cache.errors or {}

  -- Count existing entries (keys are URIs, so # cannot be used)
  local existing_count = 0
  for _ in pairs(cache.entries) do
    existing_count = existing_count + 1
  end
  print("Existing cache entries: " .. existing_count)

  -- Get list of URIs to scrape
  local uris, err = get_external_boost_uris()
  if not uris then
    print("ERROR: " .. tostring(err))
    return false
  end
  print("Found " .. #uris .. " external boost URIs in poems.json")
  print("")

  -- Filter to URIs not yet cached (unless force rescrape). The same post can
  -- be boosted more than once, so each URI is considered only once.
  local to_scrape = {}
  local seen = {}
  local skipped = 0
  local duplicates = 0
  for _, uri in ipairs(uris) do
    if type(uri) == "string" then
      if seen[uri] then
        duplicates = duplicates + 1
      else
        seen[uri] = true
        if FORCE_RESCRAPE or not cache.entries[uri] then
          to_scrape[#to_scrape + 1] = uri
        else
          skipped = skipped + 1
        end
      end
    end
  end

  print("URIs already cached: " .. skipped)
  if duplicates > 0 then
    print("Duplicate URIs ignored: " .. duplicates)
  end
  print("URIs to scrape: " .. #to_scrape)

  -- Apply max scrape limit
  if MAX_SCRAPE and #to_scrape > MAX_SCRAPE then
    print("Limiting to first " .. MAX_SCRAPE .. " URIs")
    local limited = {}
    for i = 1, MAX_SCRAPE do
      limited[i] = to_scrape[i]
    end
    to_scrape = limited
  end

  print("")

  if #to_scrape == 0 then
    print("Nothing to scrape - all URIs already cached!")
    return true
  end

  -- Sort URIs by domain for better rate limiting efficiency
  table.sort(to_scrape, function(a, b)
    return domain_of(a) < domain_of(b)
  end)

  -- Dry run: just show what would be scraped
  if DRY_RUN then
    print("=== DRY RUN - Would scrape these URIs ===")
    local domains = {}
    for _, uri in ipairs(to_scrape) do
      local domain = domain_of(uri)
      domains[domain] = (domains[domain] or 0) + 1
      print(" " .. uri)
    end
    print("")
    print("=== Domains to contact ===")
    local domain_list = {}
    for domain, count in pairs(domains) do
      domain_list[#domain_list + 1] = { domain = domain, count = count }
    end
    table.sort(domain_list, function(a, b) return a.count > b.count end)
    for _, d in ipairs(domain_list) do
      print(string.format(" %3d URIs from %s", d.count, d.domain))
    end
    print("")

    -- Sum the same pauses the real run inserts: the base delay before every
    -- request after the first, plus the same-domain delay when consecutive
    -- URIs share a domain. Request time itself is not included.
    local total_delay = 0
    for i = 2, #to_scrape do
      total_delay = total_delay + CONFIG.delay_between_requests
      if domain_of(to_scrape[i]) == domain_of(to_scrape[i - 1]) then
        total_delay = total_delay + CONFIG.delay_same_domain
      end
    end
    print("Estimated time: " .. math.ceil(total_delay / 60) .. " minutes")
    return true
  end

  -- Scrape each URI
  local last_domain = nil
  local success_count = 0
  local error_count = 0
  local start_time = os.time()
  -- Always make at least one attempt, whatever CONFIG.max_retries says.
  local attempts = math.max(1, CONFIG.max_retries)

  for i, uri in ipairs(to_scrape) do
    local domain = domain_of(uri)

    -- Rate limiting
    if i > 1 then
      local delay = CONFIG.delay_between_requests
      if domain == last_domain then
        delay = delay + CONFIG.delay_same_domain
      end
      sleep(delay)
    end

    -- Progress indicator
    io.write(string.format("[%d/%d] %s ... ", i, #to_scrape, uri:sub(1, 60)))
    io.flush()

    -- Fetch with retries
    local result = nil
    for attempt = 1, attempts do
      result = fetch_activitypub_json(uri)
      if result.success then
        break
      end
      if attempt < attempts then
        io.write("retry " .. attempt .. " ... ")
        io.flush()
        sleep(CONFIG.retry_delay)
      end
    end

    -- Extraction only happens for successful fetches.
    local content_data, extract_err
    if result.success then
      content_data, extract_err = extract_content_from_activitypub(result.data)
    end

    if content_data then
      cache.entries[uri] = {
        scraped_at = timestamp(),
        content = content_data.content,
        summary = content_data.summary,
        sensitive = content_data.sensitive,
        published = content_data.published,
        attributed_to = content_data.attributed_to,
        type = content_data.type
      }
      -- A failure recorded by an earlier run no longer applies.
      cache.errors[uri] = nil
      success_count = success_count + 1
      print("OK")

      -- Save cache periodically (every 10 successful scrapes). Checked only
      -- right after a success so failures in between do not re-trigger it.
      if success_count % 10 == 0 then
        local saved, save_err = save_cache(cache)
        if not saved then
          print("WARNING: periodic cache save failed: " .. tostring(save_err))
        end
      end
    elseif result.success then
      cache.errors[uri] = {
        attempted_at = timestamp(),
        error = extract_err or "Unknown extraction error"
      }
      error_count = error_count + 1
      print("EXTRACT ERROR: " .. (extract_err or "unknown"))
    else
      cache.errors[uri] = {
        attempted_at = timestamp(),
        error = result.error,
        status_code = result.status_code
      }
      error_count = error_count + 1
      print("ERROR: " .. tostring(result.error))
    end

    last_domain = domain
  end

  -- Final save
  local saved, save_err = save_cache(cache)

  -- Summary
  local elapsed = os.time() - start_time
  print("")
  print("=== Scraping Complete ===")
  print("Scraped: " .. success_count .. " URIs")
  print("Errors: " .. error_count .. " URIs")
  print("Time: " .. elapsed .. " seconds")
  if not saved then
    print("ERROR: cache could not be saved: " .. tostring(save_err))
    return false
  end
  print("Cache saved to: " .. CONFIG.cache_path)

  return true
end
-- }}}

-- Run scraper. scrape_boosts returns false on failure; pass that on to the
-- calling shell as a non-zero exit status instead of always exiting with 0.
if not scrape_boosts() then
  os.exit(1)
end