libs/text-chunking.lua
1-- {{{ text-chunking.lua
2-- Issue 10-050: shared long-text chunking + chunk-vector recombination.
3--
4-- Why this module exists: embedding models have a fixed context window
5-- (nomic-embed-text v1.5 caps at 2048 tokens). A poem longer than that is
6-- either rejected by the server or silently truncated by the model. Before
7-- this module, src/centroid-generator.lua had its own recursive splitter and
8-- every other consumer had none. This centralises one algorithm so the poem
9-- loop, word-cloud, colors, and centroids all chunk identically.
10--
11-- The two halves of the problem:
12-- 1. SPLIT a too-long text into pieces that each fit the model, preferring to
13-- cut at meaningful boundaries (paragraph > sentence > line > word) so a
14-- chunk is a coherent unit of meaning, not an arbitrary byte range.
15-- 2. RECOMBINE the per-chunk vectors back into one vector for the poem, so
16-- downstream code keeps seeing exactly one embedding per poem.
17--
18-- Chunk sizing is EXACT: the fit test is the real token count from an injected
19-- `count_fn(string) -> token_count`, never a character estimate. There is no
20-- chars-per-token heuristic anywhere — an estimate can undercount dense text and
21-- silently overflow the context. The count_fn is a parameter (not a hard
22-- dependency) so the algorithm stays a PURE function, unit-testable with a mock
23-- counter and no server. See libs/text-chunking-test.lua.
24--
25-- chunk_text_by_tokens returns (chunks, counts): the exact token count of each
26-- chunk, computed as a byproduct of sizing, so callers (request packing in
27-- fuzzy-computing) never have to re-estimate or re-tokenize.
28-- }}}
29
30local M = {}
31
32-- {{{ Tunables
33-- Boundary separators in descending priority. We try to cut at a paragraph
34-- break first (most semantically clean), fall back to sentence, then line, then
35-- word, and finally a hard token split if a single "word" is itself longer
36-- than a chunk (degenerate input, e.g. a giant URL or base64 blob).
37--
38-- ". " is a plain-string approximation of a sentence boundary. It misses
39-- abbreviations and "?"/"!" endings, but it is cheap and the consequence of a
40-- miss is only a slightly larger-or-smaller chunk, never a failure.
41M.SEPARATORS = { "\n\n", ". ", "\n", " " }
42-- }}}
43
44-- {{{ local function split_keeping_separator(text, sep)
45-- Split `text` on the literal string `sep`, but KEEP each separator attached to
46-- the end of the piece it followed. This matters because it makes the split
47-- lossless: table.concat(pieces) == text exactly, so re-packing the pieces can
48-- never corrupt or drop characters from a poem. (A naive gmatch split would
49-- silently eat the delimiters.) Splitting only ever happens at whitespace
50-- separators, which WordPiece never tokenizes across — so the token count of a
51-- concatenation equals the sum of its pieces' counts, which is what lets the
52-- packer below keep an EXACT running token total.
53local function split_keeping_separator(text, sep)
54 local pieces = {}
55 local start = 1
56 while true do
57 -- plain=true: treat sep as a literal string, not a Lua pattern, so
58 -- "." and other magic chars in a separator are matched verbatim.
59 local s, e = string.find(text, sep, start, true)
60 if not s then
61 pieces[#pieces + 1] = string.sub(text, start)
62 break
63 end
64 pieces[#pieces + 1] = string.sub(text, start, e)
65 start = e + 1
66 end
67 return pieces
68end
69-- }}}
70
71-- {{{ local function largest_prefix_within(text, count_fn, max_tokens)
72-- Binary-search the largest character prefix of `text` whose exact token count
73-- is <= max_tokens. Used only by the no-separator hard split below. Token count
74-- is monotonic non-decreasing in prefix length, so binary search is valid.
75local function largest_prefix_within(text, count_fn, max_tokens)
76 local lo, hi, best = 1, #text, 1
77 while lo <= hi do
78 local mid = math.floor((lo + hi) / 2)
79 if count_fn(string.sub(text, 1, mid)) <= max_tokens then
80 best = mid
81 lo = mid + 1
82 else
83 hi = mid - 1
84 end
85 end
86 return best
87end
88-- }}}
89
90-- {{{ local function hard_split_by_tokens(text, count_fn, max_tokens)
91-- Last resort when a blob has no separator of any priority yet exceeds the token
92-- budget (e.g. a giant unbroken URL/base64 run). Carves off the largest
93-- token-fitting prefix repeatedly. Rare in prose/poetry; included for safety.
94-- Returns (chunks, counts) with each chunk's exact token count.
95local function hard_split_by_tokens(text, count_fn, max_tokens)
96 local chunks, counts = {}, {}
97 local s = 1
98 while s <= #text do
99 local remaining = string.sub(text, s)
100 local len = largest_prefix_within(remaining, count_fn, max_tokens)
101 if len < 1 then len = 1 end -- always make progress
102 local piece = string.sub(remaining, 1, len)
103 chunks[#chunks + 1] = piece
104 counts[#counts + 1] = count_fn(piece)
105 s = s + len
106 end
107 return chunks, counts
108end
109-- }}}
110
111-- {{{ local function chunk_by_tokens_recursive(text, count_fn, max_tokens, sep_index)
112-- Greedily PACK pieces (split at the current priority's separator) into chunks
113-- up to max_tokens, making the fewest, fullest chunks that still respect
114-- boundaries. Any single piece that is itself too large is recursively re-split
115-- at the next-lower separator. Returns (chunks, counts) — the exact token count
116-- of every emitted chunk, free, because we already counted to size it.
117local function chunk_by_tokens_recursive(text, count_fn, max_tokens, sep_index)
118 -- Whole text fits: return it as a single chunk with its exact count. (We
119 -- tokenize even short texts — the count is needed for request packing, so
120 -- there is no char shortcut to skip it.)
121 local n_text = count_fn(text)
122 if n_text <= max_tokens then
123 return { text }, { n_text }
124 end
125
126 local sep = M.SEPARATORS[sep_index]
127 if not sep then
128 return hard_split_by_tokens(text, count_fn, max_tokens)
129 end
130
131 local pieces = split_keeping_separator(text, sep)
132 if #pieces == 1 then
133 return chunk_by_tokens_recursive(text, count_fn, max_tokens, sep_index + 1)
134 end
135
136 -- Greedy pack by summed exact piece counts. Because pieces split at
137 -- whitespace separators, the token count of a concatenation equals the sum
138 -- of its pieces' counts, so the running sum (current_tokens) is exact and
139 -- each assembled chunk is provably <= max_tokens.
140 local chunks, counts = {}, {}
141 local current = ""
142 local current_tokens = 0
143 for _, piece in ipairs(pieces) do
144 local pt = count_fn(piece)
145 if pt > max_tokens then
146 -- piece alone overflows: flush, then split it at a finer separator
147 if #current > 0 then
148 chunks[#chunks + 1] = current
149 counts[#counts + 1] = current_tokens
150 current = ""
151 current_tokens = 0
152 end
153 local subs, subcounts = chunk_by_tokens_recursive(piece, count_fn, max_tokens, sep_index + 1)
154 for i = 1, #subs do
155 chunks[#chunks + 1] = subs[i]
156 counts[#counts + 1] = subcounts[i]
157 end
158 elseif current_tokens + pt <= max_tokens then
159 current = current .. piece
160 current_tokens = current_tokens + pt
161 else
162 if #current > 0 then
163 chunks[#chunks + 1] = current
164 counts[#counts + 1] = current_tokens
165 end
166 current = piece
167 current_tokens = pt
168 end
169 end
170 if #current > 0 then
171 chunks[#chunks + 1] = current
172 counts[#counts + 1] = current_tokens
173 end
174 return chunks, counts
175end
176-- }}}
177
178-- {{{ function M.chunk_text_by_tokens(text, count_fn, max_tokens)
179-- Public entry point for exact, token-bounded chunking. count_fn(string) must
180-- return that string's exact token count under the embedding model's tokenizer.
181-- Returns (chunks, counts): chunks each <= max_tokens tokens, and counts[i] the
182-- exact token count of chunks[i]. A text that already fits comes back as one
183-- element; whitespace-only input returns ({}, {}). max_tokens is REQUIRED —
184-- compute it exactly (see fuzzy.embedding_chunk_budget); there is no default.
185function M.chunk_text_by_tokens(text, count_fn, max_tokens)
186 if not max_tokens then
187 error("chunk_text_by_tokens: max_tokens is required (compute it exactly, "
188 .. "e.g. fuzzy.embedding_chunk_budget)")
189 end
190 if type(text) ~= "string" or text:match("^%s*$") then
191 return {}, {}
192 end
193 return chunk_by_tokens_recursive(text, count_fn, max_tokens, 1)
194end
195-- }}}
196
197-- {{{ function M.combine_chunk_vectors(vectors, weights, strategy)
198-- Fold the per-chunk embedding vectors of ONE poem back into a single vector.
199--
200-- vectors : array of equal-length number arrays (one per chunk)
201-- weights : array of per-chunk weights (chunk char lengths for the default
202-- strategy); ignored by "mean" and "first_only". Optional.
203-- strategy : "length_weighted_mean" (default) | "mean" | "first_only"
204--
205-- Strategy rationale (full discussion in issues/10-050):
206-- length_weighted_mean — a chunk holding more text holds more meaning, so it
207-- pulls the combined vector harder. Best default for stanzas of uneven size.
208-- mean — equal weight per chunk; simplest, fine when chunks are even.
209-- first_only — keep only the opening chunk's vector; cheap, throws away the
210-- rest, useful only if a poem's head is treated as its anchor.
211--
212-- Returns nil if there are no vectors. A single vector is returned as-is.
213function M.combine_chunk_vectors(vectors, weights, strategy)
214 strategy = strategy or "length_weighted_mean"
215
216 if not vectors or #vectors == 0 then
217 return nil
218 end
219 if #vectors == 1 then
220 return vectors[1]
221 end
222 if strategy == "first_only" then
223 return vectors[1]
224 end
225
226 local dim = #vectors[1]
227 local combined = {}
228 for d = 1, dim do
229 combined[d] = 0
230 end
231
232 -- "mean" is just length_weighted_mean with every weight equal to 1, so we
233 -- collapse both into one accumulation pass driven by a per-chunk weight.
234 local total_weight = 0
235 for c = 1, #vectors do
236 local vec = vectors[c]
237 -- Guard: a chunk whose embedding failed/short-circuited would corrupt
238 -- the mean. Error loudly on a mismatched-dimension vector rather than
239 -- blend garbage (prefer breaking over silent fallback, per project policy).
240 if #vec ~= dim then
241 error(string.format(
242 "combine_chunk_vectors: chunk %d has dimension %d, expected %d",
243 c, #vec, dim))
244 end
245 local w
246 if strategy == "mean" then
247 w = 1
248 else
249 w = (weights and weights[c]) or 1
250 end
251 total_weight = total_weight + w
252 for d = 1, dim do
253 combined[d] = combined[d] + (vec[d] * w)
254 end
255 end
256
257 if total_weight == 0 then
258 return nil
259 end
260 for d = 1, dim do
261 combined[d] = combined[d] / total_weight
262 end
263 return combined
264end
265-- }}}
266
267return M
268
269-- vim: set foldmethod=marker:
270