libs/text-formatter.lua
1-- {{{ text-formatter.lua
2-- Issue 8-056: Shared text formatting module for poem content rendering.
3-- Used by both main thread (chronological pages) and effil worker threads
4-- (similar/different pages) to ensure identical whitespace handling.
5--
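-- Design principle: Poetry is artistic content. The author's spacing decisions
-- (leading whitespace, multi-space runs, paragraph breaks, indentation) must be
-- respected regardless of source category. Splitting text into lines never
-- re-flows it; only over-long lines are wrapped, and wrapping keeps each line's
-- own indentation (see wrap_preserving_indent, Issue 10-021).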
9-- }}}
10
11local M = {}
12
-- {{{ function M.format_poem_lines
-- Splits poem text into lines on "\n" and changes nothing else: no trimming,
-- no collapsing of spaces, no re-flow.
--
-- @param text  string|nil  raw poem text
-- @return table  sequence of line strings; {} when text is nil, false or ""
--
-- Input/Output examples:
--   "   hello world"  -> {"   hello world"}    (leading spaces preserved)
--   "hello    world"  -> {"hello    world"}    (multi-space run preserved)
--   "a\nb\n\nc"       -> {"a", "b", "", "c"}   (paragraph breaks preserved)
--   "a\n"             -> {"a", ""}             (trailing newline yields a final "")
--   "short line"      -> {"short line"}        (no modification)
--   90-char line      -> {90-char line}        (no re-flow)
--
-- Why no word-wrapping here: a `%S+`-based re-flow destroys whitespace
-- structure. The text already contains the author's intended line breaks, so
-- this function reproduces them faithfully.
function M.format_poem_lines(text)
    if not text or text == "" then
        return {}
    end

    local lines = {}
    -- Appending "\n" terminates the final line so "(.-)\n" captures it as well.
    -- This cannot introduce a spurious entry: if the text does not end with a
    -- newline its last line is non-empty, and if it does, the final "" is the
    -- (kept) empty line after that newline. No post-loop cleanup is needed.
    for line in (text .. "\n"):gmatch("(.-)\n") do
        lines[#lines + 1] = line
    end

    return lines
end
-- }}}
50
-- {{{ function M.format_poem_content
-- Main entry point for poem content formatting: splits the text into lines,
-- prefixes every line with a single space (the poem content area's left
-- padding) and wraps each padded line to max_width.
--
-- Issue 10-021: wrapping (disabled by 8-056) is back, but goes through
-- wrap_preserving_indent() so leading whitespace survives on wrapped lines.
--
-- @param text       string|nil  raw poem text
-- @param max_width  number|nil  wrap limit; defaults to 80
-- @return table  flat sequence of output lines
function M.format_poem_content(text, max_width)
    local width = max_width or 80
    local output = {}

    for _, raw_line in ipairs(M.format_poem_lines(text)) do
        -- The padding space is added before wrapping, so it counts towards
        -- the width and is treated as part of the line's indentation.
        local pieces = M.wrap_preserving_indent(" " .. raw_line, width)
        for _, piece in ipairs(pieces) do
            output[#output + 1] = piece
        end
    end

    return output
end
-- }}}
78
-- {{{ function M.decode_html_entities_for_width
-- Converts HTML content into the text a browser would display, so that its
-- width can be measured. Tags are removed and entities are replaced by the
-- character they stand for (an escaped entity such as the "gt" one is 4 bytes
-- of source but a single display character).
--
-- NOTE: This only decodes for WIDTH CALCULATION. The actual rendered content
-- must keep the entities for correct HTML display.
--
-- Decoding is done in ONE left-to-right pass so already-decoded output is never
-- re-scanned: a double-escaped entity (the "amp" entity followed by "quot;")
-- decodes to the 6-character text of the quot entity, not to a quote.
--
-- @param content  string|nil  HTML fragment
-- @return string  display text ("" for nil); non-ASCII characters are UTF-8

-- Named entities understood by the decoder, keyed by the name between the
-- ampersand and the semicolon. nbsp maps to a plain space (same width).
local named_entity_chars = {
    gt = ">",
    lt = "<",
    amp = "&",
    quot = '"',
    apos = "'",
    nbsp = " ",
}

-- Encodes a Unicode code point as UTF-8 so that M.utf8_char_count sees exactly
-- one character for it. Invalid or out-of-range values yield "?" because a
-- browser still renders one (replacement) character for them.
local function codepoint_to_utf8(cp)
    if not cp or cp < 0 or cp > 0x10FFFF then
        return "?"
    end
    if cp < 0x80 then
        return string.char(cp)
    elseif cp < 0x800 then
        return string.char(
            0xC0 + math.floor(cp / 0x40),
            0x80 + cp % 0x40)
    elseif cp < 0x10000 then
        return string.char(
            0xE0 + math.floor(cp / 0x1000),
            0x80 + math.floor(cp / 0x40) % 0x40,
            0x80 + cp % 0x40)
    end
    return string.char(
        0xF0 + math.floor(cp / 0x40000),
        0x80 + math.floor(cp / 0x1000) % 0x40,
        0x80 + math.floor(cp / 0x40) % 0x40,
        0x80 + cp % 0x40)
end

function M.decode_html_entities_for_width(content)
    if not content then
        return ""
    end

    -- Strip HTML tags first (they're invisible in display). This must happen
    -- before decoding so an escaped "<" is not mistaken for a tag opener.
    local visible = content:gsub("<[^>]+>", "")

    local decoded = visible:gsub("&(#?%w+);", function(body)
        -- Decimal numeric entity: ampersand, "#", digits, semicolon.
        local dec = body:match("^#(%d+)$")
        if dec then
            return codepoint_to_utf8(tonumber(dec))
        end
        -- Hexadecimal numeric entity: ampersand, "#x", hex digits, semicolon.
        local hex = body:match("^#[xX](%x+)$")
        if hex then
            return codepoint_to_utf8(tonumber(hex, 16))
        end
        -- Named entity; returning nil leaves unknown entities untouched.
        return named_entity_chars[body]
    end)

    return decoded
end
-- }}}
108
-- {{{ function M.utf8_char_count
-- Returns the number of UTF-8 characters (not bytes) in str, or 0 when str is
-- nil or false. A box-drawing character is 3 bytes but counts once; an ASCII
-- character is 1 byte and counts once. Padding calculations rely on this.
function M.utf8_char_count(str)
    if not str then
        return 0
    end
    -- Every character contributes exactly one byte outside the continuation
    -- range 0x80-0xBF, so counting those bytes counts the characters. gsub's
    -- second result is the number of matches.
    local _, char_total = str:gsub("[^\128-\191]", "")
    return char_total
end
-- }}}
120
-- {{{ function M.calculate_visible_width
-- Returns the visible width, in characters, of a string that may contain HTML
-- entities: the content is first passed through
-- M.decode_html_entities_for_width and the result is measured with
-- M.utf8_char_count. Used for padding calculations in golden poem formatting.
function M.calculate_visible_width(content)
    -- The extra parentheses keep only the decoder's first return value.
    return M.utf8_char_count((M.decode_html_entities_for_width(content)))
end
-- }}}
130
-- {{{ function M.wrap_preserving_indent
-- Issue 10-021: Wraps a single line to max_width while preserving leading whitespace.
-- Continuation lines inherit the same indentation as the original line.
--
-- Widths are measured in UTF-8 characters, not bytes, so accented letters,
-- dashes and curly quotes count once each and are never cut in the middle of
-- their byte sequence. (Double-width glyphs still count as one.)
--
-- Key behaviors:
-- - Lines <= max_width: returned unchanged (single-element table)
-- - Leading whitespace: captured and prepended to all wrapped lines
-- - Indent leaving fewer than 10 columns of content: line returned unchanged
-- - Long words (URLs): broken at character boundaries if they exceed available width
-- - Multi-space runs: preserved within content; whitespace that ends up at a
--   wrap point (end of an emitted line) is trimmed
--
-- @param line       string      the line to wrap
-- @param max_width  number|nil  maximum characters per line; defaults to 80
-- @return table  sequence of wrapped lines

-- Number of UTF-8 characters in s (bytes that are not continuation bytes).
local function wrap_char_len(s)
    local _, count = s:gsub("[^\128-\191]", "")
    return count
end

-- Splits s after its first `count` UTF-8 characters; returns head, tail.
local function wrap_take_chars(s, count)
    local pos, len = 1, #s
    for _ = 1, count do
        if pos > len then
            break
        end
        -- Step over the lead byte, then over any continuation bytes after it.
        pos = pos + 1
        while pos <= len do
            local byte = s:byte(pos)
            if byte < 128 or byte > 191 then
                break
            end
            pos = pos + 1
        end
    end
    return s:sub(1, pos - 1), s:sub(pos)
end

function M.wrap_preserving_indent(line, max_width)
    max_width = max_width or 80

    -- Short lines pass through unchanged
    if wrap_char_len(line) <= max_width then
        return { line }
    end

    -- Capture leading whitespace separately (the pattern matches any string)
    local leading, remainder = line:match("^(%s*)(.*)$")
    local content_width = max_width - #leading

    -- Edge case: if indent is so large we can't fit meaningful content
    if content_width < 10 then
        return { line }
    end

    local result_lines = {}
    local current = ""

    -- Emits the pending line (if any) with its trailing whitespace trimmed.
    local function flush()
        if current ~= "" then
            result_lines[#result_lines + 1] = leading .. (current:gsub("%s+$", ""))
        end
    end

    -- Each segment is a word plus the whitespace run that follows it, so
    -- multi-space runs travel with the word they come after.
    for word, trailing_space in remainder:gmatch("(%S+)(%s*)") do
        local segment = word .. trailing_space

        if wrap_char_len(current) + wrap_char_len(segment) <= content_width then
            -- Fits on current line
            current = current .. segment
        else
            -- Doesn't fit - flush current line first
            flush()

            -- A word wider than the content area (e.g. a URL) is emitted in
            -- full-width chunks; a normal word skips this loop entirely.
            local rest = word
            while wrap_char_len(rest) > content_width do
                local chunk
                chunk, rest = wrap_take_chars(rest, content_width)
                result_lines[#result_lines + 1] = leading .. chunk
            end

            -- Whatever is left starts the new current line
            current = rest .. trailing_space
        end
    end

    -- Flush final line
    flush()

    return result_lines
end
-- }}}
206
-- {{{ function M.wrap_external_url(prefix, url, content_width)
-- Render `prefix .. url` as lines no wider than content_width, BREAKING the URL
-- across lines so it fits its box instead of overflowing (the user prefers
-- wrapping over truncation -- nothing is lost). The box renderer draws each
-- line separately, so a single <a> spanning lines would be split across the box
-- walls; therefore each line carries its OWN <a href=url> wrapping the same full
-- URL, keeping every chunk clickable. URLs are ASCII so byte slicing ==
-- character slicing here.
--
-- @param prefix         string|nil  text placed before the URL on the first line
-- @param url            string|nil  the URL; nil is treated as ""
-- @param content_width  number      columns available per line; values below 1
--                                   are treated as 1
-- @return string  the lines joined with "\n"; just `prefix` when url is empty
--
-- NOTE(review): url is interpolated into the href attribute and the link text
-- as-is. Presumably the caller has already HTML-escaped it -- confirm.
function M.wrap_external_url(prefix, url, content_width)
    prefix = prefix or ""
    url = url or ""
    -- A width below 1 would produce empty chunks and never advance `pos`,
    -- looping forever; a fractional width is floored so it is a valid index.
    content_width = math.max(1, math.floor(content_width))

    local lines = {}
    local pos = 1
    -- The first line shares its width with the prefix. If the prefix alone
    -- fills the line, fall back to a full-width chunk (that line overflows).
    local budget = content_width - M.utf8_char_count(prefix)
    if budget < 1 then budget = content_width end

    while pos <= #url do
        local chunk = url:sub(pos, pos + budget - 1)
        local linked = string.format('<a href="%s" target="_blank" rel="noopener">%s</a>', url, chunk)
        lines[#lines + 1] = (#lines == 0) and (prefix .. linked) or linked
        pos = pos + budget
        -- Lines after the first have the whole width to themselves.
        budget = content_width
    end

    -- Empty URL: still render the prefix.
    if #lines == 0 then lines[1] = prefix end
    return table.concat(lines, "\n")
end
-- }}}
232
233return M
234-- }}}
235