src/poem-extractor.lua

737 lines

1#!/usr/bin/env lua
2
3-- {{{ local function setup_dir_path
-- Resolve the project root directory.
-- @param provided_dir optional directory (the first CLI argument at the call site)
-- @return provided_dir when it is truthy, otherwise the built-in default root
local function setup_dir_path(provided_dir)
    return provided_dir or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
10-- }}}
11
12-- Script configuration
13local DIR = setup_dir_path(arg and arg[1])
14
15-- Load required libraries
16package.path = DIR .. "/libs/?.lua;" .. package.path
17local dkjson = require("dkjson")
18local utils = require("utils")
19
20-- Initialize asset path configuration for standalone execution
21utils.init_assets_root(arg)
22
23-- {{{ local function relative_path
-- Shorten a path that lives under DIR to a "./"-relative form for log output.
-- @param absolute_path string path to shorten
-- @return "./<rest>" when absolute_path is DIR itself or lies inside DIR,
--         otherwise absolute_path unchanged
local function relative_path(absolute_path)
    local root_len = #DIR
    if absolute_path:sub(1, root_len) ~= DIR then
        return absolute_path
    end

    local rest = absolute_path:sub(root_len + 1)
    if rest == "" then
        -- The path is DIR itself.
        return "./"
    end
    if rest:sub(1, 1) == "/" then
        return "./" .. rest:sub(2)
    end
    if DIR:sub(-1) == "/" then
        -- DIR was given with a trailing slash, so `rest` is already relative.
        return "./" .. rest
    end

    -- Plain string-prefix hit on a sibling such as "<DIR>-old/file": the path
    -- is NOT inside DIR, so it must not be rewritten.
    return absolute_path
end
32-- }}}
33
34local M = {}
35
36-- {{{ function load_json_file
-- Read a file and decode it as JSON with dkjson.
-- @param filepath path of the JSON file
-- @return the decoded value, or nil when the file cannot be opened or when
--         dkjson reports a decode error (a warning is printed in that case)
local function load_json_file(filepath)
    local handle = io.open(filepath, "r")
    if not handle then
        return nil
    end

    local raw = handle:read("*a")
    handle:close()

    -- The second return value (position) is not needed here.
    local decoded, _, decode_err = dkjson.decode(raw, 1, nil)
    if not decode_err then
        return decoded
    end

    print("Warning: Failed to parse JSON file " .. filepath .. ": " .. decode_err)
    return nil
end
54-- }}}
55
56-- {{{ local function is_image_only_post
-- Issue 9-004: Detect whether a poem is an "image-only" post.
-- An image-only post has at least one attachment and, once whitespace and a
-- fixed set of picture-related emoji are removed, fewer than 10 characters of
-- text. Such posts carry no meaningful text to embed.
-- @param poem table with optional `attachments` (array) and `content` (string)
-- @return true when the poem is image-only, false otherwise
local function is_image_only_post(poem)
    -- Must have attachments to be an image post
    if not poem.attachments or #poem.attachments == 0 then
        return false
    end

    -- Picture-related emoji, written as UTF-8 byte sequences so this file does
    -- not depend on its own encoding. Each one is removed as a whole sequence:
    -- a "[...]" class would treat every byte as a separate member and strip
    -- stray bytes out of unrelated multi-byte characters.
    local image_emoji = {
        "\240\159\147\183", -- U+1F4F7 camera
        "\240\159\147\184", -- U+1F4F8 camera with flash
        "\240\159\150\188", -- U+1F5BC framed picture
        "\240\159\142\168", -- U+1F3A8 artist palette
        "\240\159\140\133", -- U+1F305 sunrise
        "\240\159\140\132", -- U+1F304 sunrise over mountains
        "\240\159\140\131", -- U+1F303 night with stars
        "\240\159\140\137", -- U+1F309 bridge at night
        "\240\159\143\158", -- U+1F3DE national park
        "\239\184\143",     -- U+FE0F variation selector-16
    }

    local stripped = (poem.content or ""):gsub("%s+", "")
    for _, emoji in ipairs(image_emoji) do
        -- None of these bytes is a pattern magic character, so each string is
        -- safe to use directly as a pattern.
        stripped = stripped:gsub(emoji, "")
    end

    -- Count characters rather than bytes: every UTF-8 code point has exactly
    -- one byte outside the continuation range 0x80-0xBF.
    local _, char_count = stripped:gsub("[^\128-\191]", "")

    return char_count < 10
end
77-- }}}
78
79-- {{{ local function parse_iso8601_timestamp
-- Turn an ISO 8601 style timestamp into an os.time() value for comparison.
-- Accepts "YYYY-MM-DDTHH:MM:SS" (anything after the seconds, such as ".000Z",
-- is ignored) and falls back to a bare "YYYY-MM-DD" at midnight.
-- The fields are handed to os.time() as-is, with no timezone adjustment here.
-- @param timestamp string or nil
-- @return os.time() result, or 0 when timestamp is nil or unrecognised
local function parse_iso8601_timestamp(timestamp)
    if not timestamp then return 0 end

    local y, mo, d, h, mi, s = timestamp:match(
        "(%d%d%d%d)%-(%d%d)%-(%d%d)T(%d%d):(%d%d):(%d%d)"
    )

    if not y then
        -- No time component found: try the date-only form.
        y, mo, d = timestamp:match("(%d%d%d%d)%-(%d%d)%-(%d%d)")
        if not y then
            return 0
        end
        h, mi, s = 0, 0, 0
    end

    return os.time({
        year = tonumber(y),
        month = tonumber(mo),
        day = tonumber(d),
        hour = tonumber(h),
        min = tonumber(mi),
        sec = tonumber(s)
    })
end
113-- }}}
114
115-- {{{ local function mark_image_only_posts
-- Issue 9-010: Flag image-only posts and link each to its nearest text poem.
-- Every poem gets an `is_image_only` boolean. Each image-only post also gets
-- `nearest_text_poem_id`, `nearest_text_poem_category` and
-- `nearest_text_time_delta` (seconds) describing the text poem whose
-- creation_date is closest in time. Images stay on their original post; only
-- the embedding is meant to be inherited. `nearest_text_poem_index` is NOT set
-- here because poem_index is assigned later.
-- @param poems array of poem tables (modified in place)
-- @return the same `poems` array
local function mark_image_only_posts(poems)
    -- Separate text poems and image-only posts
    local text_poems = {}
    local image_posts = {}

    for _, poem in ipairs(poems) do
        poem.is_image_only = is_image_only_post(poem)
        if poem.is_image_only then
            image_posts[#image_posts + 1] = poem
        else
            text_poems[#text_poems + 1] = poem
        end
    end

    if #image_posts == 0 then
        return poems -- No image-only posts to process
    end

    print(string.format(" Found %d image-only posts for embedding inheritance", #image_posts))

    -- Parse every text poem's timestamp exactly once. Doing it in the sort
    -- comparator and in the search loop below would re-run the pattern match
    -- for every comparison and every (image, text) pair.
    local candidates = {}
    for i, poem in ipairs(text_poems) do
        candidates[i] = {
            poem = poem,
            time = parse_iso8601_timestamp(poem.creation_date)
        }
    end

    -- Chronological order: among equally distant candidates the earlier one
    -- is met first and, with the strict "<" below, wins.
    table.sort(candidates, function(a, b)
        return a.time < b.time
    end)

    -- Find nearest text poem for each image-only post (embedding inheritance only)
    local linked_count = 0
    for _, img_post in ipairs(image_posts) do
        local img_time = parse_iso8601_timestamp(img_post.creation_date)
        local nearest = nil
        local nearest_delta = math.huge

        for _, candidate in ipairs(candidates) do
            local delta = math.abs(img_time - candidate.time)
            if delta < nearest_delta then
                nearest_delta = delta
                nearest = candidate.poem
            end
        end

        -- nearest is nil only when there are no text poems at all
        if nearest then
            img_post.nearest_text_poem_id = nearest.id
            img_post.nearest_text_poem_category = nearest.category
            img_post.nearest_text_time_delta = nearest_delta
            linked_count = linked_count + 1
        end
    end

    print(string.format(" Linked %d image-only posts to nearest text poems for embedding", linked_count))

    -- Return original poems list (now modified with is_image_only flags)
    return poems
end
177-- }}}
178
179-- {{{ local function extract_poem_info
-- Parse a compiled-file header line such as " -> file: messages/0767.txt".
-- @param header_line string
-- @return path      text after "file:", with surrounding whitespace removed
-- @return id        number taken from a trailing "<digits>.txt", or nil
-- @return category  the first path segment (text before the first "/"), or nil
-- All three are nil when the line is not a header or names no path.
local function extract_poem_info(header_line)
    -- The lazy capture plus "%s*$" drops trailing whitespace, including the
    -- "\r" left on each line of a CRLF file, which would otherwise defeat the
    -- "%.txt$" match below.
    local path = header_line:match("%->%s*file:%s*(.-)%s*$")
    if not path or path == "" then
        return nil, nil, nil
    end

    -- Try to extract numeric ID from filename
    local digits = path:match("(%d+)%.txt$")
    local id = digits and tonumber(digits)

    -- Determine category
    local category = path:match("^([^/]+)/")

    return path, id, category
end
196-- }}}
197
198-- {{{ local function parse_compiled_file
-- Parse a legacy compiled text file into an array of poem tables.
-- Expected layout, repeated per poem: a " -> file: <path>" header line, a
-- separator line beginning with at least nine dashes, then the poem text up to
-- the next header.
-- @param filepath path of the compiled file
-- @return array of { id, filepath, category, content, length }
-- Raises an error when the file cannot be opened.
local function parse_compiled_file(filepath)
    local file = io.open(filepath, "r")
    if not file then
        error("Could not open file: " .. filepath)
    end

    local poems = {}
    local current_poem = nil
    local content_lines = {}
    local in_poem_content = false

    -- Finish the poem being collected (if any) and append it to `poems`.
    -- Clearing current_poem afterwards guarantees it can never be stored twice.
    local function flush_current_poem()
        if not current_poem then
            return
        end
        local joined = table.concat(content_lines, "\n")
        current_poem.content = joined:match("^%s*(.-)%s*$")
        current_poem.length = #current_poem.content
        poems[#poems + 1] = current_poem
        current_poem = nil
    end

    for line in file:lines() do
        if line:match("^%s*%->%s*file:") then
            -- Header: the previous poem is complete.
            flush_current_poem()

            -- A header with no usable path yields nil here. current_poem then
            -- stays nil, so the lines that follow are skipped instead of being
            -- appended to (and re-saving) the previous poem.
            local poem_path, id, category = extract_poem_info(line)
            if poem_path then
                current_poem = {
                    id = id,
                    filepath = poem_path,
                    category = category,
                    content = "",
                    length = 0
                }
            end
            content_lines = {}
            in_poem_content = false
        elseif line:match("^%-%-%-%-%-%-%-%-%-") then
            -- Separator line - next content belongs to current poem
            in_poem_content = true
        elseif current_poem and in_poem_content then
            content_lines[#content_lines + 1] = line
        end
    end

    -- Don't forget the last poem
    flush_current_poem()

    file:close()
    return poems
end
252-- }}}
253
254-- {{{ function M.load_extracted_json
-- Load poems from the per-source JSON extracts under input_directory.
-- Reads "<input_directory>/<source>/files/poems.json" for the fediverse,
-- messages, notes and bluesky sources (a missing or unparsable file is
-- reported and skipped), converts every poem to the common entry shape, then
-- flags image-only posts via mark_image_only_posts.
-- @param input_directory directory holding the per-source subdirectories
-- @return array of poem entries
function M.load_extracted_json(input_directory)
    local poems = {}

    -- Fields every source has in common. `default_category` is used only when
    -- the poem carries no category of its own.
    local function base_entry(poem, default_category)
        local category = poem.category or default_category
        return {
            id = tonumber(poem.id),
            -- Reconstruct legacy path format: "<category>/<id>.txt"
            filepath = category .. "/" .. poem.id .. ".txt",
            category = category,
            content = poem.content,
            creation_date = poem.creation_date,
            length = poem.metadata and poem.metadata.character_count or #(poem.content or ""),
            metadata = poem.metadata
        }
    end

    -- Load one source's poem list, printing the same status line either way.
    -- Returns nil when the source is unavailable.
    local function load_source(name)
        local source_file = input_directory .. "/" .. name .. "/files/poems.json"
        local data = load_json_file(source_file)
        if data and data.poems then
            print("Loading " .. #data.poems .. " " .. name .. " poems from JSON")
            return data.poems
        end
        print("No " .. name .. " poems found at: " .. source_file)
        return nil
    end

    -- Fediverse: also keeps raw content, content warning and media attachments
    local fediverse_poems = load_source("fediverse")
    if fediverse_poems then
        local attachment_count = 0
        for _, poem in ipairs(fediverse_poems) do
            local entry = base_entry(poem)
            entry.raw_content = poem.raw_content
            entry.content_warning = poem.content_warning
            -- Preserve media attachments if present (from ActivityPub extraction)
            if poem.attachments then
                entry.attachments = poem.attachments
                attachment_count = attachment_count + #poem.attachments
            end
            poems[#poems + 1] = entry
        end
        if attachment_count > 0 then
            print(" Found " .. attachment_count .. " media attachments in fediverse poems")
        end
    end

    -- Messages: common fields only
    local messages_poems = load_source("messages")
    if messages_poems then
        for _, poem in ipairs(messages_poems) do
            poems[#poems + 1] = base_entry(poem)
        end
    end

    -- Notes: common fields plus content warning
    local notes_poems = load_source("notes")
    if notes_poems then
        for _, poem in ipairs(notes_poems) do
            local entry = base_entry(poem)
            entry.content_warning = poem.content_warning
            poems[#poems + 1] = entry
        end
    end

    -- Bluesky: category defaults to "bluesky" and `created_at` is preferred
    -- over `creation_date`. The default is resolved BEFORE the path is built:
    -- `poem.category or "bluesky" .. "/" .. id` parses as
    -- `poem.category or ("bluesky/" .. id)` because ".." binds tighter than
    -- "or", which left filepath as the bare category whenever one was present.
    local bluesky_poems = load_source("bluesky")
    if bluesky_poems then
        for _, poem in ipairs(bluesky_poems) do
            local entry = base_entry(poem, "bluesky")
            entry.creation_date = poem.created_at or poem.creation_date
            entry.content_warning = poem.content_warning
            poems[#poems + 1] = entry
        end
    end

    -- Issue 9-010: Mark image-only posts for embedding inheritance
    -- Images stay on original post; embedding inherits from nearest text poem
    poems = mark_image_only_posts(poems)

    return poems
end
359-- }}}
360
361-- {{{ function M.detect_input_mode
-- Decide which kind of input is available under base_directory.
-- @param base_directory project directory to inspect
-- @return "json", "<base>/input"           when any per-source poems.json can be opened
-- @return "compiled", "<base>/compiled.txt" when only the compiled file can be opened
-- @return "none", nil                      when neither is available
function M.detect_input_mode(base_directory)
    local input_dir = base_directory .. "/input"
    local compiled_file = base_directory .. "/compiled.txt"

    -- Modern JSON extraction: one poems.json per source
    local json_candidates = {
        input_dir .. "/fediverse/files/poems.json",
        input_dir .. "/messages/files/poems.json",
        input_dir .. "/notes/files/poems.json",
        input_dir .. "/bluesky/files/poems.json",
    }

    -- Probe every candidate; a file counts as present when it can be opened.
    local any_json = false
    for _, candidate in ipairs(json_candidates) do
        local probe = io.open(candidate, "r")
        if probe then
            io.close(probe)
            any_json = true
        end
    end

    if any_json then
        return "json", input_dir
    end

    -- Legacy fallback
    local compiled_handle = io.open(compiled_file, "r")
    if not compiled_handle then
        return "none", nil
    end
    io.close(compiled_handle)
    return "compiled", compiled_file
end
394-- }}}
395
396-- {{{ function assign_nearest_text_poem_index
-- Issue 9-010: Resolve each image-only post's nearest text poem to a poem_index.
-- Must run after poem_index has been assigned. Text poems are indexed by the
-- key "<category>/<id>"; every image-only post that has a nearest_text_poem_id
-- and whose key is found receives `nearest_text_poem_index`.
-- @param poems array of poem tables (modified in place)
-- @return the same `poems` array
local function assign_nearest_text_poem_index(poems)
    -- "<category>/<id>" -> poem_index, text poems only
    local index_by_key = {}
    for _, entry in ipairs(poems) do
        if not entry.is_image_only then
            index_by_key[(entry.category or "") .. "/" .. (entry.id or "")] = entry.poem_index
        end
    end

    local assigned_count = 0
    for _, entry in ipairs(poems) do
        if entry.is_image_only and entry.nearest_text_poem_id then
            local target_key = (entry.nearest_text_poem_category or "")
                .. "/" .. entry.nearest_text_poem_id
            local target_index = index_by_key[target_key]
            if target_index then
                entry.nearest_text_poem_index = target_index
                assigned_count = assigned_count + 1
            end
        end
    end

    -- Stay quiet when nothing was linked
    if assigned_count > 0 then
        print(string.format(" Assigned nearest_text_poem_index to %d image-only posts", assigned_count))
    end

    return poems
end
428-- }}}
429
430-- {{{ function M.extract_poems_auto
-- Extract poems from whichever input M.detect_input_mode finds, order and
-- index them, and optionally write the result as JSON.
-- @param base_directory project directory to search for input
-- @param output_file    path of the JSON file to write; nothing is written when nil/false
-- @param opts           optional table; opts.include_boosts (default true)
--                       controls whether reshared "boost" posts are kept. The
--                       caller resolves CLI-flag-over-config and passes the boolean.
-- @return table { metadata = {...}, poems = {...} }
-- Raises an error when no input is available or the output file cannot be created.
function M.extract_poems_auto(base_directory, output_file, opts)
    local options = opts or {}
    local keep_boosts = options.include_boosts
    if keep_boosts == nil then keep_boosts = true end

    local mode, source_path = M.detect_input_mode(base_directory)

    if mode ~= "json" and mode ~= "compiled" then
        error("No valid input found: neither JSON extracts nor compiled.txt available in " .. base_directory)
    end

    local poems
    if mode == "json" then
        print("Using modern JSON extraction from: " .. relative_path(source_path))
        poems = M.load_extracted_json(source_path)
    else
        print("Using legacy compiled.txt extraction from: " .. relative_path(source_path))
        poems = parse_compiled_file(source_path)
    end

    print("Found " .. #poems .. " poems")

    -- Optionally drop reshared boost posts. A boost is recognised by "boost"
    -- appearing in its category (case-insensitive, e.g. "fediverse_boost") or
    -- by metadata.is_boost == true. This happens BEFORE poem_index assignment
    -- so the indices stay contiguous and downstream caches stay array-aligned.
    if not keep_boosts then
        local function looks_like_boost(p)
            if p.category and p.category:lower():find("boost", 1, true) then
                return true
            end
            if p.metadata and p.metadata.is_boost == true then
                return true
            end
            return false
        end

        local survivors = {}
        local dropped = 0
        for _, p in ipairs(poems) do
            if looks_like_boost(p) then
                dropped = dropped + 1
            else
                survivors[#survivors + 1] = p
            end
        end
        poems = survivors
        print(string.format("Excluded %d boost posts (include_boosts=false)", dropped))
    end

    -- Consistent ordering: by category first, then by ID within a category
    table.sort(poems, function(left, right)
        if left.category == right.category then
            return (left.id or 0) < (right.id or 0)
        end
        return (left.category or "") < (right.category or "")
    end)

    -- poem_index is the position after sorting: unique and array-aligned.
    -- It disambiguates cross-category ID collisions such as fediverse/0002.txt
    -- and messages/0002.txt, which both have id=2. See Issue 8-019.
    for position = 1, #poems do
        poems[position].poem_index = position
    end

    -- Issue 9-010: image-only posts inherit embeddings from their nearest text poem
    print("Assigning nearest text poem indices for embedding inheritance...")
    poems = assign_nearest_text_poem_index(poems)

    local output_data = {
        metadata = {
            source_mode = mode,
            source_path = source_path,
            extracted_at = os.date("%Y-%m-%d %H:%M:%S"),
            total_poems = #poems,
            extraction_version = "2.3", -- Bumped for embedding inheritance (Issue 9-010)
            features = {
                poem_index = true, -- Issue 8-019
                embedding_inheritance = true -- Issue 9-010 (replaces image_only_association)
            }
        },
        poems = poems
    }

    -- Writing is optional: without an output_file the data is only returned
    if not output_file then
        return output_data
    end

    local json_output = dkjson.encode(output_data, { indent = true })

    local sink = io.open(output_file, "w")
    if not sink then
        error("Could not create output file: " .. output_file)
    end
    sink:write(json_output)
    sink:close()

    print("Poems extracted and saved to: " .. relative_path(output_file))

    return output_data
end
521-- }}}
522
523-- {{{ function M.extract_poems
-- Extract poems from one compiled text file and write them out as JSON.
-- @param input_file  path of the compiled file to parse
-- @param output_file path of the JSON file to write
-- @return table { metadata = {...}, poems = {...} }
-- Raises an error when either file cannot be opened.
function M.extract_poems(input_file, output_file)
    print("Extracting poems from: " .. relative_path(input_file))

    local poems = parse_compiled_file(input_file)

    print("Found " .. #poems .. " poems")

    -- Consistent ordering: by category first, then by ID within a category
    table.sort(poems, function(left, right)
        if left.category == right.category then
            return (left.id or 0) < (right.id or 0)
        end
        return (left.category or "") < (right.category or "")
    end)

    -- poem_index is the position after sorting (unique, array-aligned).
    -- See Issue 8-019 for rationale.
    for position = 1, #poems do
        poems[position].poem_index = position
    end

    local output_data = {
        metadata = {
            source_file = input_file,
            extracted_at = os.date("%Y-%m-%d %H:%M:%S"),
            total_poems = #poems,
            extraction_version = "1.1" -- Bumped for poem_index addition
        },
        poems = poems
    }

    -- Encode before opening the output file, then write it in one go
    local json_output = dkjson.encode(output_data, { indent = true })

    local sink = io.open(output_file, "w")
    if not sink then
        error("Could not create output file: " .. output_file)
    end
    sink:write(json_output)
    sink:close()

    print("Poems extracted and saved to: " .. relative_path(output_file))
    return output_data
end
570-- }}}
571
572-- {{{ function M.main
-- Entry point for command-line use.
-- @param interactive_mode when truthy, show a menu and read the choice (and,
--        for option 3, the file paths) from standard input; otherwise run
--        auto-detection against DIR straight away
function M.main(interactive_mode)
    if not interactive_mode then
        -- Default non-interactive mode - use auto-detection
        local default_output = utils.asset_path("poems.json")
        M.extract_poems_auto(DIR, default_output)
        return
    end

    print("=== Poem Extraction Tool ===")
    print("1. Auto-detect input source (JSON or compiled.txt)")
    print("2. Force extract from compiled.txt")
    print("3. Force extract from custom file")
    io.write("Select option (1-3): ")
    local choice = io.read()

    local output_file = utils.asset_path("poems.json")

    if choice == "1" then
        M.extract_poems_auto(DIR, output_file)
        return
    end

    if choice == "2" then
        M.extract_poems(DIR .. "/compiled.txt", output_file)
        return
    end

    if choice == "3" then
        -- Both paths come from the user; the default output path is replaced
        io.write("Enter input file path: ")
        local input_file = io.read()
        io.write("Enter output file path: ")
        output_file = io.read()
        M.extract_poems(input_file, output_file)
        return
    end

    print("Invalid choice")
end
605-- }}}
606
-- Command line execution (only when run directly, not when required).
-- Runs only when at least one argument was given and debug.getinfo(3)
-- reports no stack frame at level 3. Passing "-I" anywhere in the arguments
-- selects interactive mode.
if arg and #arg > 0 and debug.getinfo(3) == nil then
    local wants_interactive = false
    for _, flag in ipairs(arg) do
        if flag == "-I" then
            wants_interactive = true
            break
        end
    end

    M.main(wants_interactive)
end
619
620-- {{{ function remove_reply_syntax
-- Strip @mentions from text that is about to be embedded.
-- Removes federated "@user@server.domain" mentions first, then local "@user"
-- mentions, and finally collapses every whitespace run (newlines included) to
-- a single space and trims both ends.
-- @param content string
-- @return the cleaned string
local function remove_reply_syntax(content)
    -- Federated mentions go first so their "@server" half is not left behind
    local text = content:gsub("@[%w%.%-_]+@[%w%.%-]+%.%w+", "")

    -- Local mentions at the start of the text or after whitespace. One sweep
    -- can miss a mention whose preceding whitespace was consumed by the match
    -- before it (e.g. "@user1 @user2 @user3"), so sweep until nothing changes.
    local sweeps = {
        { "^@[%w%.%-_]+%s*", "" },
        { "(%s)@[%w%.%-_]+%s*", "%1" },
        { "(%s)@[%w%.%-_]+$", "%1" },
        { "(%s)@[%w%.%-_]+([%p])", "%1%2" },
    }
    local before
    repeat
        before = text
        for _, sweep in ipairs(sweeps) do
            text = text:gsub(sweep[1], sweep[2])
        end
    until text == before

    -- Whatever mention is still left, wherever it sits
    text = text:gsub("@[%w%.%-_]+", "")

    -- Tidy the gaps the removals left behind
    text = text:gsub("%s+", " ")
    text = text:gsub("^%s*", "")
    text = text:gsub("%s*$", "")

    return text
end
648-- }}}
649
650-- {{{ function M.extract_pure_poem_content
-- Reduce processed poem text to its "pure" content.
-- Steps, in order:
--   1. drop a leading "YYYY-MM-DD" date line;
--   2. pull out the first "CW: ..." line as the content warning and remove
--      every line fragment matching that pattern from the body;
--   3. drop a leading " -> file: ..." header line, a leading separator line
--      (4+ dashes) and a trailing separator line, should they have leaked in;
--   4. strip @mentions and collapse whitespace (remove_reply_syntax) in both
--      the content warning and the body.
-- @param processed_content string or nil (nil is treated as "")
-- @return "<content warning>\n<body>" when both are non-empty, otherwise
--         whichever of the two is non-empty (or "")
function M.extract_pure_poem_content(processed_content)
    local content = processed_content or ""

    -- Remove date stamp (YYYY-MM-DD\n)
    content = content:gsub("^%d%d%d%d%-%d%d%-%d%d\n", "")

    -- Extract content warning text (without "CW: " prefix)
    local cw_text = ""
    local cw_pattern = "CW:%s*([^\n]*)\n"
    local cw_match = content:match(cw_pattern)
    if cw_match then
        cw_text = cw_match:gsub("^%s*", ""):gsub("%s*$", "") -- trim whitespace
        content = content:gsub(cw_pattern, "") -- remove entire CW line
    end

    -- Remove header/separator artifacts (these shouldn't be in poem.content,
    -- but safety check). This has to happen while the text still contains its
    -- newlines: every pattern here is line-based, and remove_reply_syntax
    -- below turns all newlines into spaces, after which none could match.
    content = content:gsub("^%s*%->%s*file:.-\n", "")      -- leading file header
    content = content:gsub("^%s*%-%-%-%-+[ \t\r]*\n", "")  -- leading separator line
    content = content:gsub("\n%-%-%-%-+%s*$", "")          -- trailing separator line

    -- Remove reply syntax from both content warning and main content.
    -- This also collapses whitespace runs to single spaces and trims.
    if cw_text ~= "" then
        cw_text = remove_reply_syntax(cw_text)
    end
    content = remove_reply_syntax(content)

    -- Combine pure content: cleaned content warning + cleaned poem content
    local pure_content
    if cw_text ~= "" and content ~= "" then
        pure_content = cw_text .. "\n" .. content
    elseif cw_text ~= "" then
        pure_content = cw_text
    else
        pure_content = content
    end

    return pure_content
end
693-- }}}
694
695-- {{{ function M.extract_pure_poem_content_for_embedding
-- Embedding-oriented variant of M.extract_pure_poem_content.
-- Issue 6-033: extra preprocessing for better embedding quality.
-- On top of the standard extraction it:
--   1. keeps only the text before the first "\n----" boundary, if there is one;
--   2. blanks out file-path metadata (" -> file: ..." and "file: /abs/path");
--   3. blanks out separator runs of 4+ dashes;
--   4. turns every remaining dash into a space, so "cannabis-mentioned"
--      becomes "cannabis mentioned";
--   5. collapses whitespace and trims both ends.
-- NOTE(review): steps 1 and 2 rely on newlines, but the standard extraction
-- appears to collapse most newlines to spaces first - confirm these patterns
-- still cut where intended.
-- @param processed_content string or nil
-- @return the cleaned single-line string
function M.extract_pure_poem_content_for_embedding(processed_content)
    local text = M.extract_pure_poem_content(processed_content)

    -- Concatenated poems: keep only what precedes the first separator line
    local boundary = text:find("\n%-%-%-%-")
    if boundary then
        text = text:sub(1, boundary - 1)
    end

    -- Ordered clean-up passes: { pattern, replacement }
    local passes = {
        { "%s*%->%s*file:[^\n]*\n?", " " }, -- " -> file: fediverse/1678.txt" style
        { "file:%s*/[^\n]*\n?", " " },      -- "file: /home/..." absolute path style
        { "%-%-%-%-+", " " },               -- remaining separator runs (4+ dashes)
        { "%-", " " },                      -- dashes to spaces for tokenization
        { "%s+", " " },                     -- collapse whitespace
        { "^%s*", "" },                     -- trim start
        { "%s*$", "" },                     -- trim end
    }
    for _, pass in ipairs(passes) do
        text = text:gsub(pass[1], pass[2])
    end

    return text
end
735-- }}}
736
737return M