libs/text-chunking-test.lua
1-- {{{ text-chunking-test.lua
2-- Issue 10-050: unit tests for libs/text-chunking.lua.
3--
4-- Pure-function tests — no inference server, no files. Run directly:
5-- luajit libs/text-chunking-test.lua
6-- Exits non-zero on failure so it can gate a build/deploy step.
7--
8-- Sizing is token-exact via an injected counter. `count_chars` (1 token per
9-- character) is a perfectly additive mock, so a max_tokens of N behaves like an
10-- N-character limit AND the returned per-chunk counts equal the chunk lengths.
11-- }}}
12
13package.path = "libs/?.lua;" .. package.path
14local chunk = require("text-chunking")
15
-- Running tallies maintained by check(): how many checks failed and how many
-- were run in total. Both start at zero.
local failures, tests = 0, 0
18
-- {{{ local function check(name, condition, detail)
-- Record the outcome of one check and print a result line.
--   name      label printed on the result line
--   condition truthy -> " ok" line; nil/false -> " FAIL" line and `failures`
--             is incremented
--   detail    optional extra context, appended to a FAIL line after " -> ".
--             It is passed through tostring() so that a boolean or table
--             given as detail cannot raise inside the reporter itself.
-- Always increments `tests`.
local function check(name, condition, detail)
    tests = tests + 1
    if condition then
        print(string.format(" ok %s", name))
        return
    end

    failures = failures + 1
    local suffix = ""
    if detail then
        suffix = " -> " .. tostring(detail)
    end
    print(string.format(" FAIL %s%s", name, suffix))
end
-- }}}
30
-- Mock token counter: reports the length of `s` (via `#`) as its token count.
local function count_chars(s)
    return #s
end
32
-- {{{ short text is returned whole
-- An input far below the 7200-token limit must come back as exactly one chunk
-- that equals the input.
local short_input = "a short poem"
local single = chunk.chunk_text_by_tokens(short_input, count_chars, 7200)
local is_single_whole = (#single == 1) and (single[1] == short_input)
check("short text -> single chunk", is_single_whole)
-- }}}
37
-- {{{ empty / whitespace -> no chunks
-- Each case is { check label, input }; every input must produce zero chunks.
local blank_cases = {
    { "empty string -> {}", "" },
    { "whitespace -> {}", " \n\t " },
}
for _, case in ipairs(blank_cases) do
    local produced = chunk.chunk_text_by_tokens(case[2], count_chars, 7200)
    check(case[1], #produced == 0)
end
-- }}}
42
-- {{{ max_tokens is required (no guessed default)
-- Calling with max_tokens = nil must raise; pcall turns the raise into a
-- false status that the check then expects.
local ok_req = pcall(chunk.chunk_text_by_tokens, "hi", count_chars, nil)
check("max_tokens is required (errors when nil)", not ok_req)
-- }}}
47
-- {{{ splitting is LOSSLESS and counts are EXACT
-- The most important invariant: chunking must never drop or alter a character.
-- Builds 60 two-sentence paragraphs joined by blank lines, chunks them with a
-- 500-token limit, and verifies the pieces concatenate back to the input.
local parts = {}
for i = 1, 60 do
    parts[#parts + 1] = "Paragraph number " .. i .. " has several words in it. "
        .. "It also has a second sentence to give the splitter something to chew on."
end
local long_text = table.concat(parts, "\n\n")
local chunks, counts = chunk.chunk_text_by_tokens(long_text, count_chars, 500)
check("long text actually split into many chunks", #chunks > 1, "got " .. #chunks)
check("lossless: concat(chunks) == original", table.concat(chunks) == long_text)
-- With an additive counter, every returned count must equal the chunk's length.
-- The type guard comes first: if the second return value is missing, `#counts`
-- would raise and abort the whole run instead of producing a FAIL line.
local counts_exact = type(counts) == "table" and #counts == #chunks
local counts_detail
if type(counts) ~= "table" then
    counts_detail = "second return value is a " .. type(counts)
end
if counts_exact then
    for i = 1, #chunks do
        if counts[i] ~= #chunks[i] then
            counts_exact = false
            break
        end
    end
end
check("returned per-chunk counts are exact (additive counter)", counts_exact,
    counts_detail)
-- }}}
66
-- {{{ every chunk respects the max size
-- Re-examines `chunks` (long_text split with count_chars at a 500 limit):
-- no chunk may be longer than 500.
local all_within = true
for i = 1, #chunks do
    all_within = all_within and (#chunks[i] <= 500)
end
check("every chunk <= max_tokens", all_within)
-- }}}
74
-- {{{ degenerate input: one giant unbreakable token -> hard split, still lossless
-- 1730 separator-free characters under a 500 limit: expect exactly 4 chunks
-- whose concatenation reproduces the input.
local giant = ("x"):rep(1730) -- no separators at all
local giant_chunks = chunk.chunk_text_by_tokens(giant, count_chars, 500)
local giant_count = #giant_chunks
local giant_ok = false
if giant_count == 4 then
    giant_ok = (table.concat(giant_chunks) == giant)
end
check("giant unbreakable token hard-splits losslessly", giant_ok,
    "chunks=" .. giant_count)
-- }}}
82
-- {{{ counter drives splitting with a NON-char token ratio
-- tok4 reports 1 token per 4 chars (rounded up). The checks confirm that the
-- split obeys the TOKEN limit as measured by tok4, not the character length,
-- and that no text is lost.
local function tok4(s)
    return math.ceil(#s / 4)
end

local sentences = {}
for i = 1, 50 do
    sentences[i] = "Sentence number " .. i .. " has a handful of words."
end
local body = table.concat(sentences, "\n\n")
local tchunks = chunk.chunk_text_by_tokens(body, tok4, 30) -- ~120 chars/chunk ceiling
check("token chunk: long text splits into many", #tchunks > 1, "chunks=" .. #tchunks)
check("token chunk: lossless (concat == original)", table.concat(tchunks) == body)

local all_under = true
for i = 1, #tchunks do
    if tok4(tchunks[i]) > 30 then
        all_under = false
    end
end
check("token chunk: every chunk <= max_tokens by the counter", all_under)
-- }}}
97
-- {{{ token chunking: no-separator blob hard-splits by tokens, losslessly
-- 250 separator-free characters under a 60 limit: expect 5 chunks, none longer
-- than 60, concatenating back to the input.
local blob = ("z"):rep(250) -- no separators of any kind
local bchunks = chunk.chunk_text_by_tokens(blob, count_chars, 60)
local blossless = (table.concat(bchunks) == blob)
local bunder = true
for i = 1, #bchunks do
    if #bchunks[i] > 60 then
        bunder = false
    end
end
local blob_ok = blossless and bunder and #bchunks == 5
check("token chunk: no-separator blob hard-splits losslessly",
    blob_ok, "chunks=" .. #bchunks)
-- }}}
107
-- {{{ combine: single vector returned as-is
-- With one input vector and no further arguments, the result must carry the
-- same three components.
local one = chunk.combine_chunk_vectors({ {1, 2, 3} })
local identity_ok = true
for i, expected in ipairs({ 1, 2, 3 }) do
    if one[i] ~= expected then
        identity_ok = false
    end
end
check("combine single vector -> identity", identity_ok)
-- }}}
112
-- {{{ combine: plain mean
-- Component-wise average of (0,0) and (2,4) is (1,2).
local mean = chunk.combine_chunk_vectors({ {0, 0}, {2, 4} }, nil, "mean")
local mean_x, mean_y = mean[1], mean[2]
check("mean of (0,0),(2,4) = (1,2)", mean_x == 1 and mean_y == 2)
-- }}}
117
-- {{{ combine: length-weighted mean favours the longer chunk
-- weights 1 and 3: (1*[0,0] + 3*[4,8]) / 4 = [3,6]
local weighted_vectors = { {0, 0}, {4, 8} }
local weights = { 1, 3 }
local lwm = chunk.combine_chunk_vectors(weighted_vectors, weights, "length_weighted_mean")
local lwm_x, lwm_y = lwm[1], lwm[2]
check("length-weighted mean tilts toward heavier chunk", lwm_x == 3 and lwm_y == 6)
-- }}}
123
-- {{{ combine: first_only drops the tail
-- The result must match the first vector (9,9); the second vector (0,0) must
-- not influence it.
local first = chunk.combine_chunk_vectors({ {9, 9}, {0, 0} }, nil, "first_only")
local first_x, first_y = first[1], first[2]
check("first_only keeps only chunk 1", first_x == 9 and first_y == 9)
-- }}}
128
-- {{{ combine: dimension mismatch errors loudly (no silent garbage)
-- A 3-component and a 2-component vector cannot be combined; the call must
-- raise, which pcall reports as a false status.
local ok_err = pcall(chunk.combine_chunk_vectors, { {1, 2, 3}, {1, 2} }, nil, "mean")
check("dimension mismatch raises an error", not ok_err)
-- }}}
135
-- {{{ summary
-- Print the pass tally, then exit with status 1 if any check failed so the
-- script can gate a build/deploy step.
local passed = tests - failures
print(string.format("\n%d/%d checks passed", passed, tests))
if failures > 0 then os.exit(1) end
-- }}}
142
143-- vim: set foldmethod=marker:
144