src/poem-extractor.lua

3-- {{{ local function setup_dir_path

-- Resolve the project root directory.
-- Returns `provided_dir` when the caller supplied a truthy value, otherwise
-- the built-in default location.
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end

10-- }}}

23-- {{{ local function relative_path

-- Turn an absolute path located under the project root (DIR) into a
-- "./"-prefixed relative path for display; any other path is returned as-is.
--
-- The prefix comparison must end on a path-component boundary. A plain
-- string-prefix test would treat "/srv/project-backup/x" as living inside
-- "/srv/project" and print the misleading "./-backup/x".
local function relative_path(absolute_path)
    local root_len = #DIR
    if absolute_path:sub(1, root_len) ~= DIR then
        return absolute_path
    end

    local rel = absolute_path:sub(root_len + 1)
    local on_boundary = root_len == 0      -- empty root: everything is "inside"
        or rel == ""                       -- the path IS the root
        or rel:sub(1, 1) == "/"            -- root followed by a separator
        or DIR:sub(-1) == "/"              -- root itself ends with a separator
    if not on_boundary then
        return absolute_path
    end

    if rel:sub(1, 1) == "/" then
        rel = rel:sub(2)
    end
    return "./" .. rel
end

32-- }}}

36-- {{{ function load_json_file

-- Read a file and decode it as JSON with dkjson.
-- Returns the decoded value. Returns nil when the file cannot be opened, or
-- when the decoder reports an error (in which case a warning is printed).
local function load_json_file(filepath)
    local handle = io.open(filepath, "r")
    if not handle then return nil end

    local raw_text = handle:read("*a")
    handle:close()

    -- Third result of decode is the error message; the second (position) is unused.
    local decoded, _, parse_error = dkjson.decode(raw_text, 1, nil)
    if not parse_error then
        return decoded
    end

    print("Warning: Failed to parse JSON file " .. filepath .. ": " .. parse_error)
    return nil
end

54-- }}}

56-- {{{ local function is_image_only_post

-- Issue 9-004: Detect whether a poem is an "image-only" post.
-- Image-only posts have attachments but minimal text content (just image
-- emoji, or fewer than 10 characters). Such posts cannot be meaningfully
-- embedded because there is no semantic content.
--
-- Emoji that merely announce a picture and carry no meaning of their own.
-- They are removed as whole byte sequences: Lua patterns are byte-oriented,
-- so putting multi-byte characters inside a [...] class would strip the
-- individual bytes out of unrelated UTF-8 characters as well.
local IMAGE_ONLY_EMOJI = {
    "📷", -- U+1F4F7 camera
    "📸", -- U+1F4F8 camera with flash
    "🖼", -- U+1F5BC framed picture
    "🎨", -- U+1F3A8 artist palette
    "🌅", -- U+1F305 sunrise
    "🌄", -- U+1F304 sunrise over mountains
    "🌃", -- U+1F303 night with stars
    "🌉", -- U+1F309 bridge at night
    "🏞", -- U+1F3DE national park
    "\239\184\143", -- U+FE0F variation selector left behind by the above
}

-- @param poem table with optional `attachments` (array) and `content` (string)
-- @return boolean true when the post has attachments and fewer than 10
--         characters of text once whitespace and image emoji are removed
local function is_image_only_post(poem)
    -- Must have attachments to be an image post
    if not poem.attachments or #poem.attachments == 0 then
        return false
    end

    local stripped = (poem.content or ""):gsub("%s+", "")
    for _, emoji in ipairs(IMAGE_ONLY_EMOJI) do
        -- every byte of these sequences is >= 0x80, so none is a pattern magic character
        stripped = stripped:gsub(emoji, "")
    end

    -- Count UTF-8 characters (every byte that is not a continuation byte)
    -- rather than bytes, so the threshold means "10 characters" for
    -- non-ASCII text as well.
    local _, char_count = stripped:gsub("[^\128-\191]", "")
    return char_count < 10
end

77-- }}}

79-- {{{ local function parse_iso8601_timestamp

-- Number of days between 1970-01-01 and the given proleptic Gregorian date
-- (negative for earlier dates). The year is shifted to start in March so the
-- leap day falls at the end of the shifted year.
local function days_from_civil(year, month, day)
    if month <= 2 then year = year - 1 end
    local era = math.floor(year / 400)
    local year_of_era = year - era * 400
    local shifted_month = (month + 9) % 12 -- March = 0 ... February = 11
    local day_of_year = math.floor((153 * shifted_month + 2) / 5) + day - 1
    local day_of_era = year_of_era * 365 + math.floor(year_of_era / 4)
        - math.floor(year_of_era / 100) + day_of_year
    return era * 146097 + day_of_era - 719468
end

-- Parse an ISO 8601 timestamp to Unix epoch seconds for comparison.
-- Handles "2024-03-15T10:30:00Z", "2024-03-15T10:30:00.000Z", a numeric
-- offset such as "2024-03-15T12:30:00+02:00", and date-only "2024-03-15"
-- (treated as midnight UTC).
--
-- The epoch is computed arithmetically in UTC instead of via os.time(),
-- which interprets its fields as *local* time: the result would depend on the
-- host timezone and differences across a DST change would be off by an hour.
--
-- @param timestamp string|nil
-- @return number epoch seconds, or 0 when the input is missing or unparseable
local function parse_iso8601_timestamp(timestamp)
    if type(timestamp) ~= "string" then return 0 end

    local year, month, day, hour, min, sec, rest_pos = timestamp:match(
        "(%d%d%d%d)%-(%d%d)%-(%d%d)T(%d%d):(%d%d):(%d%d)()"
    )

    local offset_seconds = 0
    if year then
        -- Optional fractional seconds, then an optional "+HH:MM" / "-HHMM" offset.
        -- "Z" or no designator means UTC (offset 0).
        local sign, off_hour, off_min = timestamp:match(
            "^%.?%d*([+-])(%d%d):?(%d%d)", rest_pos
        )
        if sign then
            offset_seconds = tonumber(off_hour) * 3600 + tonumber(off_min) * 60
            if sign == "-" then offset_seconds = -offset_seconds end
        end
    else
        -- Fallback: try date-only format
        year, month, day = timestamp:match("(%d%d%d%d)%-(%d%d)%-(%d%d)")
        if not year then return 0 end
        hour, min, sec = 0, 0, 0
    end

    local days = days_from_civil(tonumber(year), tonumber(month), tonumber(day))
    return days * 86400
        + tonumber(hour) * 3600
        + tonumber(min) * 60
        + tonumber(sec)
        - offset_seconds -- wall-clock time minus its offset gives UTC
end

113-- }}}

115-- {{{ local function mark_image_only_posts

-- Issue 9-010: Mark image-only posts and find the nearest text poem (by
-- creation time) for embedding inheritance.
-- Images stay on their original post; only the embedding is inherited.
-- This replaces the old association system from Issue 9-004.
--
-- Side effects on the poem tables in `poems`:
--   * every poem gets `is_image_only` (boolean)
--   * every image-only post with a nearest text poem gets
--     `nearest_text_poem_id`, `nearest_text_poem_category` and
--     `nearest_text_time_delta` (seconds)
-- `nearest_text_poem_index` is NOT set here; it is filled in after poem_index
-- assignment.
--
-- @param poems array of poem tables
-- @return the same `poems` array (modified in place)
local function mark_image_only_posts(poems)
    -- Separate text poems and image-only posts
    local text_poems = {}
    local image_posts = {}

    for _, poem in ipairs(poems) do
        poem.is_image_only = is_image_only_post(poem)
        if poem.is_image_only then
            image_posts[#image_posts + 1] = poem
        else
            text_poems[#text_poems + 1] = poem
        end
    end

    if #image_posts == 0 then
        return poems -- No image-only posts to process
    end

    print(string.format(" Found %d image-only posts for embedding inheritance", #image_posts))

    -- Parse each text poem's timestamp exactly once. Re-parsing inside the
    -- sort comparator and inside the per-image scan repeats the same work
    -- many times per text poem.
    local text_entries = {}
    for i, poem in ipairs(text_poems) do
        text_entries[i] = {
            poem = poem,
            time = parse_iso8601_timestamp(poem.creation_date),
        }
    end

    table.sort(text_entries, function(a, b)
        return a.time < b.time
    end)

    -- Find nearest text poem for each image-only post (for embedding inheritance only)
    local linked_count = 0
    for _, img_post in ipairs(image_posts) do
        local img_time = parse_iso8601_timestamp(img_post.creation_date)
        local nearest = nil
        local nearest_delta = math.huge

        for _, entry in ipairs(text_entries) do
            local delta = math.abs(img_time - entry.time)
            if delta < nearest_delta then
                nearest_delta = delta
                nearest = entry.poem
            end
            -- Entries are sorted ascending: once we are at or past the image's
            -- time, every later entry is at least as far away, so it could
            -- never win the strict "<" comparison above.
            if entry.time >= img_time then
                break
            end
        end

        if nearest then
            img_post.nearest_text_poem_id = nearest.id
            img_post.nearest_text_poem_category = nearest.category
            img_post.nearest_text_time_delta = nearest_delta
            linked_count = linked_count + 1
        end
    end

    print(string.format(" Linked %d image-only posts to nearest text poems for embedding", linked_count))

    -- Return original poems list (now modified with is_image_only flags)
    return poems
end

177-- }}}

179-- {{{ local function extract_poem_info

-- Parse a compiled-file header line such as " -> file: messages/0767.txt" or
-- " -> file: fediverse/1234.txt".
--
-- @param header_line string
-- @return path     string  the path after "file:", without surrounding whitespace
-- @return id       number|nil  numeric part of a "<digits>.txt" filename
-- @return category string|nil  first path component (text before the first "/")
-- Returns nil, nil, nil when the line carries no path.
local function extract_poem_info(header_line)
    -- Trim trailing whitespace (including the "\r" of CRLF files); otherwise
    -- the "%.txt$" anchor below never matches and the id is silently lost.
    local path = header_line:match("%->%s*file:%s*(.-)%s*$")
    if not path or path == "" then
        return nil, nil, nil
    end

    -- Try to extract numeric ID from filename
    local id = tonumber(path:match("(%d+)%.txt$") or "")

    -- Determine category
    local category = path:match("^([^/]+)/")

    return path, id, category
end

196-- }}}

198-- {{{ local function parse_compiled_file

-- Parse a legacy compiled text file into an array of poem tables.
--
-- Format as handled here: a header line (" -> file: <path>") starts a poem, a
-- line beginning with nine or more dashes marks the start of its content, and
-- every following line up to the next header belongs to that poem.
--
-- @param filepath string path of the compiled file
-- @return array of { id, filepath, category, content, length }
-- Raises an error when the file cannot be opened.
local function parse_compiled_file(filepath)
    local file = io.open(filepath, "r")
    if not file then
        error("Could not open file: " .. filepath)
    end

    local poems = {}
    local current_poem = nil
    local content_lines = {}
    local in_poem_content = false

    -- Store the poem being collected (if any) and clear it, so the same table
    -- can never be appended twice.
    local function finish_current_poem()
        if not current_poem then return end
        local joined = table.concat(content_lines, "\n")
        current_poem.content = joined:match("^%s*(.-)%s*$") -- trim both ends
        current_poem.length = #current_poem.content
        poems[#poems + 1] = current_poem
        current_poem = nil
    end

    for line in file:lines() do
        if line:match("^%s*%->%s*file:") then
            -- Header: close the previous poem and start collecting a new one
            finish_current_poem()
            content_lines = {}
            in_poem_content = false

            local poem_path, id, category = extract_poem_info(line)
            if poem_path then
                current_poem = {
                    id = id,
                    filepath = poem_path,
                    category = category,
                    content = "",
                    length = 0
                }
            end
            -- When the header has no usable path, current_poem stays nil and
            -- the lines that follow are skipped instead of being attached to
            -- (and re-inserting) the previous poem.
        elseif line:match("^%-%-%-%-%-%-%-%-%-") then
            -- Separator line - next content belongs to current poem
            in_poem_content = true
        elseif current_poem and in_poem_content then
            content_lines[#content_lines + 1] = line
        end
    end

    -- Don't forget the last poem
    finish_current_poem()

    file:close()
    return poems
end

252-- }}}

254-- {{{ function M.load_extracted_json

-- JSON sources read by M.load_extracted_json, in load order. Each one lives at
-- <input_directory>/<name>/files/poems.json. The flags select the optional
-- fields copied for that source.
local JSON_POEM_SOURCES = {
    { name = "fediverse", content_warning = true, raw_content = true, attachments = true },
    { name = "messages" },
    { name = "notes", content_warning = true },
    { name = "bluesky", content_warning = true, prefer_created_at = true },
}

-- Build the normalized poem entry for one decoded JSON poem.
local function build_json_poem_entry(poem, source)
    -- Fall back to the source name when the poem carries no category, so the
    -- path reconstruction below cannot fail on a nil category.
    local category = poem.category or source.name

    local entry = {
        id = tonumber(poem.id),
        -- Reconstruct legacy path format. The category is resolved first:
        -- `poem.category or "bluesky" .. "/" .. id .. ".txt"` parses as
        -- `poem.category or ("bluesky/...")` because ".." binds tighter than
        -- "or", which would make the filepath just the category name.
        filepath = category .. "/" .. poem.id .. ".txt",
        category = category,
        content = poem.content,
        creation_date = poem.creation_date,
        length = poem.metadata and poem.metadata.character_count or #(poem.content or ""),
        metadata = poem.metadata
    }

    if source.prefer_created_at then
        entry.creation_date = poem.created_at or poem.creation_date
    end
    if source.content_warning then
        entry.content_warning = poem.content_warning
    end
    if source.raw_content then
        entry.raw_content = poem.raw_content
    end
    -- Preserve media attachments if present (from ActivityPub extraction).
    -- Attachments contain image/video metadata that can be used for HTML generation.
    if source.attachments and poem.attachments then
        entry.attachments = poem.attachments
    end

    return entry
end

-- Load the poems of every JSON source under `input_directory` into one array
-- and mark image-only posts for embedding inheritance (Issue 9-010).
-- A source whose file is missing, unparseable or has no `poems` key is
-- reported and skipped.
--
-- @param input_directory string directory containing the per-source folders
-- @return array of poem entries
function M.load_extracted_json(input_directory)
    local poems = {}

    for _, source in ipairs(JSON_POEM_SOURCES) do
        local json_path = input_directory .. "/" .. source.name .. "/files/poems.json"
        local data = load_json_file(json_path)

        if data and data.poems then
            print("Loading " .. #data.poems .. " " .. source.name .. " poems from JSON")

            local attachment_count = 0
            for _, poem in ipairs(data.poems) do
                local entry = build_json_poem_entry(poem, source)
                if entry.attachments then
                    attachment_count = attachment_count + #entry.attachments
                end
                poems[#poems + 1] = entry
            end

            if attachment_count > 0 then
                print(" Found " .. attachment_count .. " media attachments in " .. source.name .. " poems")
            end
        else
            print("No " .. source.name .. " poems found at: " .. json_path)
        end
    end

    -- Issue 9-010: Mark image-only posts for embedding inheritance
    -- Images stay on original post; embedding inherits from nearest text poem
    poems = mark_image_only_posts(poems)

    return poems
end

359-- }}}

361-- {{{ function M.detect_input_mode

-- Decide which input source is available under `base_directory`.
--
-- @return "json", <base>/input          when any per-source poems.json can be opened
-- @return "compiled", <base>/compiled.txt  when only the legacy file can be opened
-- @return "none", nil                   otherwise
function M.detect_input_mode(base_directory)
    -- True when the file can be opened for reading.
    local function is_readable(path)
        local handle = io.open(path, "r")
        if handle then
            io.close(handle)
            return true
        end
        return false
    end

    local input_dir = base_directory .. "/input"

    -- Modern JSON extraction: one poems.json per source
    for _, source_name in ipairs({ "fediverse", "messages", "notes", "bluesky" }) do
        if is_readable(input_dir .. "/" .. source_name .. "/files/poems.json") then
            return "json", input_dir
        end
    end

    local compiled_file = base_directory .. "/compiled.txt"
    if is_readable(compiled_file) then
        return "compiled", compiled_file
    end

    return "none", nil
end

394-- }}}

396-- {{{ function assign_nearest_text_poem_index

-- Issue 9-010: Once poem_index has been assigned, translate each image-only
-- post's (nearest_text_poem_category, nearest_text_poem_id) reference into the
-- poem_index of that text poem, stored as `nearest_text_poem_index`.
-- This allows the embedding generator to look up embeddings by index.
--
-- @param poems array of poem tables (modified in place)
-- @return the same `poems` array
local function assign_nearest_text_poem_index(poems)
    -- "<category>/<id>" key shared by both passes
    local function lookup_key(category, id)
        return (category or "") .. "/" .. id
    end

    -- Pass 1: index every non-image poem by its key
    local index_by_key = {}
    for _, poem in ipairs(poems) do
        if not poem.is_image_only then
            index_by_key[lookup_key(poem.category, poem.id or "")] = poem.poem_index
        end
    end

    -- Pass 2: resolve the reference stored on each image-only post
    local assigned_count = 0
    for _, poem in ipairs(poems) do
        local target_id = poem.is_image_only and poem.nearest_text_poem_id
        if target_id then
            local found = index_by_key[lookup_key(poem.nearest_text_poem_category, target_id)]
            if found then
                poem.nearest_text_poem_index = found
                assigned_count = assigned_count + 1
            end
        end
    end

    if assigned_count > 0 then
        print(string.format(" Assigned nearest_text_poem_index to %d image-only posts", assigned_count))
    end

    return poems
end

428-- }}}

430-- {{{ function M.extract_poems_auto

-- Extract poems from whichever input source is available under
-- `base_directory` (JSON extracts preferred, legacy compiled.txt otherwise),
-- sort them, assign poem_index, and optionally write the result as JSON.
--
-- opts.include_boosts (default true) controls whether reshared "boost" posts
-- are kept. The caller resolves CLI-flag-over-config and passes the boolean.
--
-- @param base_directory string project directory to inspect
-- @param output_file    string|nil  when given, the result is written there
-- @param opts           table|nil   { include_boosts = boolean }
-- @return table { metadata = {...}, poems = {...} }
-- Raises an error when no input is found or the output file cannot be
-- created or written.
function M.extract_poems_auto(base_directory, output_file, opts)
    opts = opts or {}
    local include_boosts = opts.include_boosts
    if include_boosts == nil then include_boosts = true end

    local mode, source_path = M.detect_input_mode(base_directory)

    local poems
    if mode == "json" then
        print("Using modern JSON extraction from: " .. relative_path(source_path))
        poems = M.load_extracted_json(source_path)
    elseif mode == "compiled" then
        print("Using legacy compiled.txt extraction from: " .. relative_path(source_path))
        poems = parse_compiled_file(source_path)
    else
        error("No valid input found: neither JSON extracts nor compiled.txt available in " .. base_directory)
    end

    print("Found " .. #poems .. " poems")

    -- Optionally drop reshared boost posts. A boost is identified by its
    -- directory-derived category (e.g. "fediverse_boost") or a metadata flag.
    -- Filtering BEFORE poem_index assignment keeps the indices contiguous so
    -- downstream caches stay array-aligned.
    -- NOTE(review): in JSON mode image-only posts were already linked to
    -- their nearest text poem inside M.load_extracted_json, i.e. before this
    -- filter. If that poem is a boost removed here, the lookup below finds
    -- nothing and the post ends up without nearest_text_poem_index.
    if not include_boosts then
        local kept, removed = {}, 0
        for _, p in ipairs(poems) do
            local is_boost = (p.category and p.category:lower():find("boost", 1, true) ~= nil)
                or (p.metadata and p.metadata.is_boost == true)
            if is_boost then removed = removed + 1 else kept[#kept + 1] = p end
        end
        poems = kept
        print(string.format("Excluded %d boost posts (include_boosts=false)", removed))
    end

    -- Sort poems by category, then by ID for consistent ordering.
    -- Categories are normalized BEFORE being compared for equality:
    -- testing the raw values but ordering by the normalized ones treats
    -- nil and "" as different-yet-equal, which is not a valid order function.
    table.sort(poems, function(a, b)
        local cat_a, cat_b = a.category or "", b.category or ""
        if cat_a ~= cat_b then
            return cat_a < cat_b
        end
        return (a.id or 0) < (b.id or 0)
    end)

    -- Assign poem_index after sorting (unique, array-aligned identifier)
    -- This solves cross-category ID collisions: fediverse/0002.txt and messages/0002.txt
    -- both have id=2 but different poem_index values. See Issue 8-019.
    for i, poem in ipairs(poems) do
        poem.poem_index = i
    end

    -- Issue 9-010: Assign nearest_text_poem_index for embedding inheritance
    -- Image-only posts inherit embeddings from nearest text poem
    print("Assigning nearest text poem indices for embedding inheritance...")
    poems = assign_nearest_text_poem_index(poems)

    -- Create output structure
    local output_data = {
        metadata = {
            source_mode = mode,
            source_path = source_path,
            extracted_at = os.date("%Y-%m-%d %H:%M:%S"),
            total_poems = #poems,
            extraction_version = "2.3", -- Bumped for embedding inheritance (Issue 9-010)
            features = {
                poem_index = true, -- Issue 8-019
                embedding_inheritance = true -- Issue 9-010 (replaces image_only_association)
            }
        },
        poems = poems
    }

    if output_file then
        -- Save to JSON file
        local json_output = dkjson.encode(output_data, { indent = true })

        local output = io.open(output_file, "w")
        if not output then
            error("Could not create output file: " .. output_file)
        end

        -- write() returns a falsy value plus a message on failure (e.g. disk
        -- full); report it instead of announcing a successful save.
        local written, write_err = output:write(json_output)
        output:close()
        if not written then
            error("Could not write output file: " .. output_file .. ": " .. tostring(write_err))
        end

        print("Poems extracted and saved to: " .. relative_path(output_file))
    end

    return output_data
end

521-- }}}

523-- {{{ function M.extract_poems

-- Extract poems from a legacy compiled text file, sort them, assign
-- poem_index and write the result to `output_file` as JSON.
--
-- @param input_file  string path of the compiled file
-- @param output_file string path of the JSON file to write
-- @return table { metadata = {...}, poems = {...} }
-- Raises an error when the input cannot be opened or the output file cannot
-- be created or written.
function M.extract_poems(input_file, output_file)
    print("Extracting poems from: " .. relative_path(input_file))

    local poems = parse_compiled_file(input_file)

    print("Found " .. #poems .. " poems")

    -- Sort poems by category, then by ID for consistent ordering.
    -- Categories are normalized BEFORE being compared for equality:
    -- testing the raw values but ordering by the normalized ones treats
    -- nil and "" as different-yet-equal, which is not a valid order function.
    table.sort(poems, function(a, b)
        local cat_a, cat_b = a.category or "", b.category or ""
        if cat_a ~= cat_b then
            return cat_a < cat_b
        end
        return (a.id or 0) < (b.id or 0)
    end)

    -- Assign poem_index after sorting (unique, array-aligned identifier)
    -- See Issue 8-019 for rationale.
    for i, poem in ipairs(poems) do
        poem.poem_index = i
    end

    -- Create output structure
    local output_data = {
        metadata = {
            source_file = input_file,
            extracted_at = os.date("%Y-%m-%d %H:%M:%S"),
            total_poems = #poems,
            extraction_version = "1.1" -- Bumped for poem_index addition
        },
        poems = poems
    }

    -- Save to JSON file
    local json_output = dkjson.encode(output_data, { indent = true })

    local output = io.open(output_file, "w")
    if not output then
        error("Could not create output file: " .. output_file)
    end

    -- write() returns a falsy value plus a message on failure (e.g. disk
    -- full); report it instead of announcing a successful save.
    local written, write_err = output:write(json_output)
    output:close()
    if not written then
        error("Could not write output file: " .. output_file .. ": " .. tostring(write_err))
    end

    print("Poems extracted and saved to: " .. relative_path(output_file))
    return output_data
end

570-- }}}

572-- {{{ function M.main

-- Entry point of the extraction tool.
--
-- Non-interactive (falsy `interactive_mode`): auto-detect the input source
-- under DIR and write the default poems.json asset.
-- Interactive: show a menu on stdout and read the choice (and, for option 3,
-- the file paths) from stdin.
function M.main(interactive_mode)
    if not interactive_mode then
        -- Default non-interactive mode - use auto-detection
        local output_file = utils.asset_path("poems.json")
        M.extract_poems_auto(DIR, output_file)
        return
    end

    -- Read one line from stdin without surrounding whitespace.
    -- Returns nil at end of input (io.read() returns nil there).
    local function read_trimmed_line()
        local line = io.read()
        if not line then return nil end
        return (line:match("^%s*(.-)%s*$"))
    end

    print("=== Poem Extraction Tool ===")
    print("1. Auto-detect input source (JSON or compiled.txt)")
    print("2. Force extract from compiled.txt")
    print("3. Force extract from custom file")
    io.write("Select option (1-3): ")
    local choice = read_trimmed_line()

    local output_file = utils.asset_path("poems.json")

    if choice == "1" then
        M.extract_poems_auto(DIR, output_file)
    elseif choice == "2" then
        local input_file = DIR .. "/compiled.txt"
        M.extract_poems(input_file, output_file)
    elseif choice == "3" then
        io.write("Enter input file path: ")
        local input_file = read_trimmed_line()
        io.write("Enter output file path: ")
        local custom_output = read_trimmed_line()

        -- A missing path (closed stdin or an empty line) would otherwise
        -- surface as an obscure error deep inside the extraction code.
        if not input_file or input_file == ""
            or not custom_output or custom_output == "" then
            print("Invalid file path")
            return
        end
        M.extract_poems(input_file, custom_output)
    else
        print("Invalid choice")
        return
    end
end

605-- }}}

620-- {{{ function remove_reply_syntax

-- Strip reply/mention syntax from text that is about to be embedded.
-- Removes "@username@server.domain" (federated) and "@username" (local)
-- mentions, then collapses every run of whitespace (newlines included) into a
-- single space and trims both ends.
local function remove_reply_syntax(content)
    local handle = "@[%w%.%-_]+"

    -- Federated mentions first, so the plain-handle patterns below do not
    -- leave the "@server.domain" half behind.
    local text = content:gsub(handle .. "@[%w%.%-]+%.%w+", "")

    -- Local mentions. Repeated until nothing changes, so chains such as
    -- "@user1 @user2 @user3" are removed completely.
    while true do
        local before = text
        text = text:gsub("^" .. handle .. "%s*", "")             -- at the very start
        text = text:gsub("(%s)" .. handle .. "%s*", "%1")        -- after whitespace
        text = text:gsub("(%s)" .. handle .. "$", "%1")          -- at the very end
        text = text:gsub("(%s)" .. handle .. "([%p])", "%1%2")   -- before punctuation
        if text == before then break end
    end

    -- Whatever mention is still left (e.g. glued to other characters)
    text = text:gsub(handle, "")

    -- Normalize the whitespace left behind
    text = text:gsub("%s+", " ")
    return (text:match("^%s*(.-)%s*$"))
end

648-- }}}

650-- {{{ function M.extract_pure_poem_content

-- Reduce processed poem text to its "pure" content: date stamp removed,
-- content-warning line folded in (without the "CW:" prefix), reply/mention
-- syntax and leaked header/separator artifacts stripped.
--
-- @param processed_content string|nil
-- @return string  "<cw text>\n<content>" when both exist, otherwise whichever
--                 of the two is non-empty (or "")
function M.extract_pure_poem_content(processed_content)
    local content = processed_content or ""

    -- Remove date stamp (YYYY-MM-DD\n)
    content = content:gsub("^%d%d%d%d%-%d%d%-%d%d\n", "")

    -- Extract content warning text (without "CW: " prefix)
    local cw_text = ""
    local cw_pattern = "CW:%s*([^\n]*)\n"
    local cw_match = content:match(cw_pattern)
    if cw_match then
        cw_text = cw_match:match("^%s*(.-)%s*$") -- trim whitespace
        -- Remove only the line that was captured above. Without the count,
        -- every later "CW: ..." line in the poem would be deleted as well.
        content = content:gsub(cw_pattern, "", 1)
    end

    -- Remove any title/ID/separator artifacts if present
    -- (These shouldn't be in poem.content but safety check)
    -- This has to happen BEFORE remove_reply_syntax: that function collapses
    -- all whitespace, newlines included, and these patterns rely on "\n".
    content = content:gsub("^%s*%->%s*file:.-\n", "") -- file headers
    content = content:gsub("^%-%-%-%-+\n", "")        -- separator lines
    content = content:gsub("\n%-%-%-%-+$", "")        -- trailing separators

    -- Remove reply syntax from both content warning and main content
    if cw_text ~= "" then
        cw_text = remove_reply_syntax(cw_text)
    end
    content = remove_reply_syntax(content)

    -- Remove extra formatting newlines (multiple consecutive newlines)
    content = content:gsub("\n\n+", "\n"):gsub("^\n", ""):gsub("\n$", "")

    -- Combine pure content: cleaned content warning + cleaned poem content
    if cw_text ~= "" and content ~= "" then
        return cw_text .. "\n" .. content
    elseif cw_text ~= "" then
        return cw_text
    end
    return content
end

693-- }}}

695-- {{{ function M.extract_pure_poem_content_for_embedding

-- Enhanced version of extract_pure_poem_content for embedding generation
-- Issue 6-033: Additional preprocessing for better embedding quality
-- Key differences from extract_pure_poem_content:
-- 1. Converts dashes to spaces (better tokenization: "cannabis-mentioned" → "cannabis mentioned")
-- 2. Strips file path metadata that leaked into content
-- 3. Strips separator lines (----)
-- 4. Isolates single poem if multiple are concatenated
--
-- Steps 2-4 run on the raw input, BEFORE delegating to
-- M.extract_pure_poem_content. Its reply-syntax cleanup collapses all
-- whitespace, newlines included. After that a "\n----" boundary can no longer
-- be found, and a "[^\n]*" header pattern would swallow everything up to the
-- end of the text.
--
-- @param processed_content string|nil
-- @return string single-line text ready for embedding
function M.extract_pure_poem_content_for_embedding(processed_content)
    local raw = processed_content or ""

    -- Drop header / separator lines that precede the poem itself
    local before
    repeat
        before = raw
        raw = raw:gsub("^%s*%->%s*file:[^\n]*\n?", "")
        raw = raw:gsub("^%s*%-%-%-%-+[ \t\r]*\n?", "")
    until raw == before

    -- Isolate the first poem: cut at the first later line that starts a
    -- separator ("\n----") or a file header ("\n -> file:")
    local boundary = raw:find("\n%-%-%-%-")
    local header_pos = raw:find("\n[ \t]*%->%s*file:")
    if header_pos and (not boundary or header_pos < boundary) then
        boundary = header_pos
    end
    if boundary then
        raw = raw:sub(1, boundary - 1)
    end

    -- Remove file path metadata that sits inside a line
    -- Pattern 1: " -> file: fediverse/1678.txt" style
    raw = raw:gsub("[ \t]*%->%s*file:[^\n]*", "")
    -- Pattern 2: "file: /home/ritz/..." absolute path style
    raw = raw:gsub("file:%s*/[^\n]*", "")

    -- Standard pure content extraction (date stamp, content warning, mentions)
    local content = M.extract_pure_poem_content(raw)

    -- Convert dashes to spaces for better embedding tokenization
    -- "cannabis-mentioned" becomes "cannabis mentioned" which tokenizes better.
    -- This also covers any remaining run of dashes, since the whitespace
    -- produced here is collapsed right below.
    content = content:gsub("%-", " ")

    -- Clean up multiple consecutive spaces and whitespace artifacts
    content = content:gsub("%s+", " ")
    return (content:match("^%s*(.-)%s*$"))
end

735-- }}}