libs/text-chunking.lua

270 lines

1-- {{{ text-chunking.lua

2-- Issue 10-050: shared long-text chunking + chunk-vector recombination.

3--

4-- Why this module exists: embedding models have a fixed context window

5-- (nomic-embed-text v1.5 caps at 2048 tokens). A poem longer than that is

6-- either rejected by the server or silently truncated by the model. Before

7-- this module, src/centroid-generator.lua had its own recursive splitter and

8-- every other consumer had none. This centralises one algorithm so the poem

9-- loop, word-cloud, colors, and centroids all chunk identically.

10--

11-- The two halves of the problem:

12-- 1. SPLIT a too-long text into pieces that each fit the model, preferring to

13-- cut at meaningful boundaries (paragraph > sentence > line > word) so a

14-- chunk is a coherent unit of meaning, not an arbitrary byte range.

15-- 2. RECOMBINE the per-chunk vectors back into one vector for the poem, so

16-- downstream code keeps seeing exactly one embedding per poem.

17--

18-- Chunk sizing is EXACT: the fit test is the real token count from an injected

19-- `count_fn(string) -> token_count`, never a character estimate. There is no

20-- chars-per-token heuristic anywhere — an estimate can undercount dense text and

21-- silently overflow the context. The count_fn is a parameter (not a hard

22-- dependency) so the algorithm stays a PURE function, unit-testable with a mock

23-- counter and no server. See libs/text-chunking-test.lua.

24--

25-- chunk_text_by_tokens returns (chunks, counts): the exact token count of each

26-- chunk, computed as a byproduct of sizing, so callers (request packing in

27-- fuzzy-computing) never have to re-estimate or re-tokenize.

28-- }}}

30local M = {}

32-- {{{ Tunables

33-- Boundary separators in descending priority. We try to cut at a paragraph

34-- break first (most semantically clean), fall back to sentence, then line, then

35-- word, and finally a hard token split if a single "word" is itself longer

36-- than a chunk (degenerate input, e.g. a giant URL or base64 blob).

37--

38-- ". " is a plain-string approximation of a sentence boundary. It misses

39-- abbreviations and "?"/"!" endings, but it is cheap and the consequence of a

40-- miss is only a slightly larger-or-smaller chunk, never a failure.

41M.SEPARATORS = { "\n\n", ". ", "\n", " " }

42-- }}}

44-- {{{ local function split_keeping_separator(text, sep)

45-- Split `text` on the literal string `sep`, but KEEP each separator attached to

46-- the end of the piece it followed. This matters because it makes the split

47-- lossless: table.concat(pieces) == text exactly, so re-packing the pieces can

48-- never corrupt or drop characters from a poem. (A naive gmatch split would

49-- silently eat the delimiters.) Splitting only ever happens at whitespace

50-- separators, which WordPiece never tokenizes across — so the token count of a

51-- concatenation equals the sum of its pieces' counts, which is what lets the

52-- packer below keep an EXACT running token total.

53local function split_keeping_separator(text, sep)

54 local pieces = {}

55 local start = 1

56 while true do

57 -- plain=true: treat sep as a literal string, not a Lua pattern, so

58 -- "." and other magic chars in a separator are matched verbatim.

59 local s, e = string.find(text, sep, start, true)

60 if not s then

61 pieces[#pieces + 1] = string.sub(text, start)

62 break

63 end

64 pieces[#pieces + 1] = string.sub(text, start, e)

65 start = e + 1

66 end

67 return pieces

68end

69-- }}}

71-- {{{ local function largest_prefix_within(text, count_fn, max_tokens)

72-- Binary-search the largest character prefix of `text` whose exact token count

73-- is <= max_tokens. Used only by the no-separator hard split below. Token count

74-- is monotonic non-decreasing in prefix length, so binary search is valid.

75local function largest_prefix_within(text, count_fn, max_tokens)

76 local lo, hi, best = 1, #text, 1

77 while lo <= hi do

78 local mid = math.floor((lo + hi) / 2)

79 if count_fn(string.sub(text, 1, mid)) <= max_tokens then

80 best = mid

81 lo = mid + 1

82 else

83 hi = mid - 1

84 end

85 end

86 return best

87end

88-- }}}

90-- {{{ local function hard_split_by_tokens(text, count_fn, max_tokens)

91-- Last resort when a blob has no separator of any priority yet exceeds the token

92-- budget (e.g. a giant unbroken URL/base64 run). Carves off the largest

93-- token-fitting prefix repeatedly. Rare in prose/poetry; included for safety.

94-- Returns (chunks, counts) with each chunk's exact token count.

95local function hard_split_by_tokens(text, count_fn, max_tokens)

96 local chunks, counts = {}, {}

97 local s = 1

98 while s <= #text do

99 local remaining = string.sub(text, s)

100 local len = largest_prefix_within(remaining, count_fn, max_tokens)

101 if len < 1 then len = 1 end -- always make progress

102 local piece = string.sub(remaining, 1, len)

103 chunks[#chunks + 1] = piece

104 counts[#counts + 1] = count_fn(piece)

105 s = s + len

106 end

107 return chunks, counts

108end

109-- }}}

110

111-- {{{ local function chunk_by_tokens_recursive(text, count_fn, max_tokens, sep_index)

112-- Greedily PACK pieces (split at the current priority's separator) into chunks

113-- up to max_tokens, making the fewest, fullest chunks that still respect

114-- boundaries. Any single piece that is itself too large is recursively re-split

115-- at the next-lower separator. Returns (chunks, counts) — the exact token count

116-- of every emitted chunk, free, because we already counted to size it.

117local function chunk_by_tokens_recursive(text, count_fn, max_tokens, sep_index)

118 -- Whole text fits: return it as a single chunk with its exact count. (We

119 -- tokenize even short texts — the count is needed for request packing, so

120 -- there is no char shortcut to skip it.)

121 local n_text = count_fn(text)

122 if n_text <= max_tokens then

123 return { text }, { n_text }

124 end

125

126 local sep = M.SEPARATORS[sep_index]

127 if not sep then

128 return hard_split_by_tokens(text, count_fn, max_tokens)

129 end

130

131 local pieces = split_keeping_separator(text, sep)

132 if #pieces == 1 then

133 return chunk_by_tokens_recursive(text, count_fn, max_tokens, sep_index + 1)

134 end

135

136 -- Greedy pack by summed exact piece counts. Because pieces split at

137 -- whitespace separators, the token count of a concatenation equals the sum

138 -- of its pieces' counts, so the running sum (current_tokens) is exact and

139 -- each assembled chunk is provably <= max_tokens.

140 local chunks, counts = {}, {}

141 local current = ""

142 local current_tokens = 0

143 for _, piece in ipairs(pieces) do

144 local pt = count_fn(piece)

145 if pt > max_tokens then

146 -- piece alone overflows: flush, then split it at a finer separator

147 if #current > 0 then

148 chunks[#chunks + 1] = current

149 counts[#counts + 1] = current_tokens

150 current = ""

151 current_tokens = 0

152 end

153 local subs, subcounts = chunk_by_tokens_recursive(piece, count_fn, max_tokens, sep_index + 1)

154 for i = 1, #subs do

155 chunks[#chunks + 1] = subs[i]

156 counts[#counts + 1] = subcounts[i]

157 end

158 elseif current_tokens + pt <= max_tokens then

159 current = current .. piece

160 current_tokens = current_tokens + pt

161 else

162 if #current > 0 then

163 chunks[#chunks + 1] = current

164 counts[#counts + 1] = current_tokens

165 end

166 current = piece

167 current_tokens = pt

168 end

169 end

170 if #current > 0 then

171 chunks[#chunks + 1] = current

172 counts[#counts + 1] = current_tokens

173 end

174 return chunks, counts

175end

176-- }}}

177

178-- {{{ function M.chunk_text_by_tokens(text, count_fn, max_tokens)

179-- Public entry point for exact, token-bounded chunking. count_fn(string) must

180-- return that string's exact token count under the embedding model's tokenizer.

181-- Returns (chunks, counts): chunks each <= max_tokens tokens, and counts[i] the

182-- exact token count of chunks[i]. A text that already fits comes back as one

183-- element; whitespace-only input returns ({}, {}). max_tokens is REQUIRED —

184-- compute it exactly (see fuzzy.embedding_chunk_budget); there is no default.

185function M.chunk_text_by_tokens(text, count_fn, max_tokens)

186 if not max_tokens then

187 error("chunk_text_by_tokens: max_tokens is required (compute it exactly, "

188 .. "e.g. fuzzy.embedding_chunk_budget)")

189 end

190 if type(text) ~= "string" or text:match("^%s*$") then

191 return {}, {}

192 end

193 return chunk_by_tokens_recursive(text, count_fn, max_tokens, 1)

194end

195-- }}}

196

197-- {{{ function M.combine_chunk_vectors(vectors, weights, strategy)

198-- Fold the per-chunk embedding vectors of ONE poem back into a single vector.

199--

200-- vectors : array of equal-length number arrays (one per chunk)

201-- weights : array of per-chunk weights (chunk char lengths for the default

202-- strategy); ignored by "mean" and "first_only". Optional.

203-- strategy : "length_weighted_mean" (default) | "mean" | "first_only"

204--

205-- Strategy rationale (full discussion in issues/10-050):

206-- length_weighted_mean — a chunk holding more text holds more meaning, so it

207-- pulls the combined vector harder. Best default for stanzas of uneven size.

208-- mean — equal weight per chunk; simplest, fine when chunks are even.

209-- first_only — keep only the opening chunk's vector; cheap, throws away the

210-- rest, useful only if a poem's head is treated as its anchor.

211--

212-- Returns nil if there are no vectors. A single vector is returned as-is.

213function M.combine_chunk_vectors(vectors, weights, strategy)

214 strategy = strategy or "length_weighted_mean"

215

216 if not vectors or #vectors == 0 then

217 return nil

218 end

219 if #vectors == 1 then

220 return vectors[1]

221 end

222 if strategy == "first_only" then

223 return vectors[1]

224 end

225

226 local dim = #vectors[1]

227 local combined = {}

228 for d = 1, dim do

229 combined[d] = 0

230 end

231

232 -- "mean" is just length_weighted_mean with every weight equal to 1, so we

233 -- collapse both into one accumulation pass driven by a per-chunk weight.

234 local total_weight = 0

235 for c = 1, #vectors do

236 local vec = vectors[c]

237 -- Guard: a chunk whose embedding failed/short-circuited would corrupt

238 -- the mean. Error loudly on a mismatched-dimension vector rather than

239 -- blend garbage (prefer breaking over silent fallback, per project policy).

240 if #vec ~= dim then

241 error(string.format(

242 "combine_chunk_vectors: chunk %d has dimension %d, expected %d",

243 c, #vec, dim))

244 end

245 local w

246 if strategy == "mean" then

247 w = 1

248 else

249 w = (weights and weights[c]) or 1

250 end

251 total_weight = total_weight + w

252 for d = 1, dim do

253 combined[d] = combined[d] + (vec[d] * w)

254 end

255 end

256

257 if total_weight == 0 then

258 return nil

259 end

260 for d = 1, dim do

261 combined[d] = combined[d] / total_weight

262 end

263 return combined

264end

265-- }}}

266

267return M

268

269-- vim: set foldmethod=marker:

270