config.lua

873 lines

1-- {{{ config.lua
2-- Issue 10-003: Single authoritative configuration for neocities-modernization
3-- All settings are validated against actual script usage as of 2026-01-21.
4-- Sections are organized with vimfolds for easy navigation.
5--
6-- For detailed field documentation, see: docs/config-reference.md (Issue 10-019)
7--
8-- Usage:
9-- local config = require("config-loader")
10-- local assets_root = config.asset_paths.assets_root
11-- local colors = config.semantic_colors
12-- }}}
13
14return {
15 -- {{{ asset_paths
16 -- Root directory for all generated assets: embeddings, caches, indexes.
17 -- Scripts use this to locate poem embeddings, similarity matrices, and other
18 -- computed data that persists between pipeline runs.
19 asset_paths = {
20 assets_root = "/mnt/mtwo/programming/ai-stuff/neocities-modernization/assets"
21 },
22 -- }}}
23
24 -- {{{ layout
25 -- Controls the visual appearance of poem boxes in generated HTML.
26 -- These values are read by src/flat-html-generator.lua:load_layout_from_config()
27 -- Width values are in characters. Junction positions are character offsets.
28 layout = {
29 regular_poem_width = 83, -- Width of standard poem boxes
30 golden_poem_width = 85, -- Width of golden poem boxes (1024 chars)
31 text_content_width = 80, -- Inner content area width
32 left_box_width = 11, -- Left navigation box width
33 right_box_width = 13, -- Right navigation box width
34 gap_width = 59, -- Gap between left and right boxes
35 left_junction_pos = 5, -- Position of left box junction point
36 right_junction_pos = 6 -- Position of right box junction point
37 },
38 -- }}}
39
40 -- NOTE: input_sources section REMOVED (Issue 10-015a)
41 -- All source paths are now in the unified 'sources' section below.
42 -- Extractors use sources-loader.lua to read paths.
43
44 -- {{{ sources
45 -- Unified input source configuration (Issue 10-015, extended 10-026).
46 -- Each source type supports multiple named directories.
47 -- Pipeline deduplicates by content ID across directories.
48 -- All extractors now use sources-loader.lua to read these paths.
49 --
50 -- Issue 10-026: External sync info is now embedded in each source:
51 -- - directories[].external.source = where to rsync from
52 -- - archives[] = ZIP files that extract to this source's directory
53 -- Use sources-loader.get_all_external_syncs() to collect all sync entries.
54 sources = {
55 fediverse = {
56 enabled = true,
57 format = "activitypub",
58 directories = {
59 {
60 name = "primary",
61 path = "input/fediverse",
62 },
63 },
64 -- Issue 10-026: Archive sources (ZIP files that extract to this source's directory)
65 archives = {
66 {
67 name = "fediverse-zip",
68 source = "/home/ritz/backups/fediverse/backups/most-recent-29.zip",
69 extract_to = "input", -- Extracts to input/ root (archive contains fediverse/ dir)
70 },
71 },
72 media = {
73 extract_attachments = true,
74 output_path = "input/media_attachments/fediverse",
75 },
76 },
77 messages = {
78 enabled = true,
79 format = "messages_export",
80 directories = {
81 {
82 name = "primary",
83 path = "input/messages",
84 },
85 },
86 -- Issue 10-026: Archive sources
87 archives = {
88 {
89 name = "messages-zip",
90 source = "/home/ritz/backups/messages-to-myself/input-zip-file/queen-of-her-castle.zip",
91 extract_to = "input", -- Extracts to input/ root (archive contains messages/ dir)
92 },
93 },
94 },
95 notes = {
96 enabled = true,
97 format = "plaintext",
98 directories = {
99 {
100 name = "primary",
101 path = "input/notes",
102 -- Issue 10-026: External source for rsync
103 external = {
104 source = "/home/ritz/notes",
105 },
106 },
107 },
108 },
109 bluesky = {
110 enabled = true,
111 format = "atproto",
112 directories = {
113 {
114 name = "primary",
115 path = "input/bluesky",
116 -- Issue 10-026: External source for rsync
117 external = {
118 source = "/home/ritz/backups/bluesky/input",
119 },
120 },
121 },
122 },
123 images = {
124 enabled = true,
125 -- include_by_default (per source, default true):
126 -- true -> ship EVERYTHING from this source, minus the entries in
127 -- excluded_images that name files in it (a blacklist).
128 -- false -> ship NOTHING from this source EXCEPT the entries in
129 -- excluded_images that name files in it (a whitelist) --
130 -- i.e. the SAME list, but its lines now ADD rather than
131 -- remove. Use false when a source is a big directory and
132 -- you only want a handful of files out of it.
133 -- This keeps excluded_images as one flat list; the flag decides
134 -- whether a source's lines subtract from "all" or add to "none".
135 directories = {
136 {
137 name = "fediverse-media",
138 -- Bugfix: this pointed at input/images/files, which never
139 -- exists, so 546 fediverse post attachments were silently
140 -- skipped and never rendered inline. The ZIP extraction
141 -- (scripts/update -> scripts/zip-extractor.lua) writes the
142 -- deeply-nested Mastodon media to input/media_attachments/
143 -- files/..., so the source points there now.
144 path = "input/media_attachments/files",
145 description = "Mastodon/ActivityPub media attachments (deeply nested)",
146 -- No external: comes from ZIP extraction
147 },
148 {
149 name = "my-art",
150 include_by_default = true,
151 path = "input/images/my-art",
152 description = "artwork made in kolourpaint",
153 -- Issue 10-026: External source for rsync
154 external = {
155 source = "/home/ritz/pictures/my-art",
156 },
157 },
158 {
159 name = "things-I-almost-posted",
160 include_by_default = true,
161 path = "input/images/things-i-almost-posted",
162 external = {
163 source = "/home/ritz/pictures/things-i-almost-posted",
164 },
165 randomize_order = false,
166 },
167 {
168 name = "poem-pictures",
169 include_by_default = true,
170 path = "input/images/poem-pictures",
171 external = {
172 source = "/home/ritz/pictures/poem-pictures",
173 },
174 },
175 {
176 name = "dnd-pictures-from-the-internet",
177 include_by_default = true,
178 path = "input/images/dnd-pictures",
179 external = {
180 source = "/home/ritz/pictures/dnd-pictures",
181 },
182 -- Issue 10-030: Randomize the position of these images in the timeline
183 -- (they have no meaningful dates, so they are scattered throughout).
184 randomize_order = true,
185
186 },
187 {
188 -- NOTE: the external sync writes into fediverse-stars and this source reads from there.
189 -- 'path' was updated to match that sync destination (it was fediverse-backup); the rsync origin below keeps the old name.
190 name = "fediverse-stars",
191 include_by_default = true,
192 path = "input/images/fediverse-stars",
193 external = {
194 source = "/home/ritz/pictures/fediverse-backup",
195 },
196 -- Issue 10-030: Randomize the position of these images in the timeline
197 -- (they have no meaningful dates, so they are scattered throughout).
198 randomize_order = true,
199 },
200 },
201 supported_formats = {"png", "jpg", "jpeg", "gif", "webp", "svg"},
202 max_file_size_mb = 200,
203 preserve_structure = true,
204 overwrite_existing = false,
205 },
206 },
207 -- }}}
208
209 -- {{{ external_files - DEPRECATED (Issue 10-026)
210 -- This section has been merged into the 'sources' section above.
211 -- External sync info is now stored as 'external' fields in each source's directories,
212 -- and as 'archives' arrays for ZIP files.
213 --
214 -- external-sync.lua now reads from sources-loader.get_all_external_syncs()
215 -- which collects external sync info from the unified sources configuration.
216 --
217 -- This empty array is kept for backward compatibility during the transition.
218 -- It can be removed after confirming all scripts use sources-loader.
219 external_files = {},
220 -- }}}
221
222 -- {{{ extraction
223 -- Controls which input sources are processed during extraction.
224 -- Disabling a source skips it entirely, useful for testing or partial rebuilds.
225 extraction = {
226 enable_fediverse = true,
227 enable_messages = true,
228 enable_notes = true,
229 enable_bluesky = true,
230 -- Issue 7-003: ZIP files to ignore during archive scanning.
231 -- These are ZIPs that appear in input/ but aren't content archives
232 -- (e.g., site backups embedded in media_attachments from fediverse export).
233 ignored_archives = {
234 "neocities-ritz-menardi" -- Neocities site backup, not content data
235 }
236 },
237 -- }}}
238
239 -- {{{ randomization
240 -- Issue 10-058: One master seed governs every randomization site in a build.
241 -- Today that is the word-cloud word shuffle (src/wordcloud-generator.lua) and
242 -- the image-order randomization for any source that does not pin its own
243 -- per-source random_seed (src/image-manager.lua). Reproducibility needs two
244 -- things: (a) all randomness flowing from ONE known seed, and (b) that seed
245 -- recorded somewhere durable. run.sh resolves the seed -- precedence is the
246 -- --seed CLI flag > this config value > an auto-generated seed -- and records
247 -- the resolved value to output/generation-metadata.json and the run log, so a
248 -- build is always answerable to "which seed produced this?".
249 --
250 -- seed = nil => run.sh invents a seed each build and RECORDS it, so even a
251 -- build nobody thought to seed is reproducible after the fact.
252 -- seed = N => a fixed non-negative integer pins the build: the same seed
253 -- over the same inputs yields byte-identical shuffled output.
254 --
255 -- A `--seed N` on the run.sh command line overrides this value for one run.
256 randomization = {
257 seed = nil,
258 },
259 -- }}}
260
261 -- {{{ excluded_poems
262 -- Issue 6-031: Poems to exclude from the collection during extraction.
263 -- Excluded poems leave gaps in the ID sequence (tombstoning) - they don't
264 -- shift other poem IDs down, preserving stable anchor links.
265 -- Read by: libs/exclusion-filter.lua
266 --
267 -- ID Formats by Category:
268 -- fediverse: Numeric post ID from ActivityPub (e.g., "113847291038475")
269 -- notes: Filename without extension (e.g., "what-a-lame-movie")
270 -- messages: Numeric message index (e.g., "42")
271 -- bluesky: AT Protocol record key (e.g., "3k...abc")
272 --
273 -- Finding poem IDs:
274 -- Browse chronological.html, search poems.json, or grep generated HTML
275 excluded_poems = {
276 fediverse = {
277 -- Add fediverse post IDs here, e.g.: "113847291038475"
278 },
279 notes = {
280 -- Add note filenames here (without extension), e.g.: "test-post-please-ignore"
281 -- 0129.txt is a raw PDF file (its content starts with %PDF-1.5), not text.
282 -- Extraction stored the PDF's binary bytes as the poem content, and because
283 -- a PDF is maximally dissimilar to every text poem it became a diversity
284 -- outlier -- landing on ~7,900 "different" pages and dumping ~14KB of binary
285 -- (NUL bytes, PDF stream data) into each one. Excluding it tombstones the
286 -- poem so it never enters poems.json, clearing the whole "different" section.
287 -- (Takes effect on the next extraction run.)
288 "0129",
289 },
290 messages = {
291 -- Add message indices here, e.g.: "42"
292 },
293 bluesky = {
294 -- Add bluesky record keys here
295 }
296 },
297 -- }}}
298
299 -- {{{ excluded_images
300 -- Issue 10-053: Images to exclude, named RELATIVE TO input/images/ -- i.e.
301 -- "<source>/<path-within-source>", the same shape you'd see under
302 -- input/images/. The leading "input/images/" is implied, so it is no longer
303 -- repeated on every line (it carried no information and hid the part that
304 -- matters: which gallery, which file).
305 --
306 -- These are STRIPPED from input/ by scripts/strip-excluded after sync, so
307 -- they never get cataloged, embedded, flattened into output/media, rendered,
308 -- OR uploaded with input/. The originals stay safe in the /home/ritz/... rsync
309 -- sources (a later sync re-copies them; the strip removes them again).
310 --
311 -- VALIDATED at build start: strip-excluded resolves every entry back to its
312 -- rsync source and ERRORS if one points at no real file -- a wrong path
313 -- (e.g. forgetting a subdirectory like kooky-dookerie/) can no longer fail
314 -- silently and let the image ship anyway. Fix the path and re-run the phase.
315 --
316 -- Finding an image's path: copy it from the gallery/page that shows it, or
317 -- ls input/images/<source>/ then drop the "input/images/" prefix.
318 excluded_images = {
319 -- "my-art/that-one-i-regret.png",
320 "poem-pictures/stick-cubes-2.png",
321 "my-art/sword-of-damocles-3.png",
322 "my-art/help-me-obiwan-kenobi-3.png",
323 "my-art/help-me-obiwan-kenobi-2.png",
324 "my-art/help-me-obiwan-kenobi-1.png",
325 "my-art/help-me-obiwan-kenobi.png",
326 "my-art/legion-td-idea.png",
327 "my-art/chat-application-with-arrows.png",
328 "my-art/air-defence-drones-1.png",
329 "my-art/air-defence-drones-2.png",
330 "my-art/air-defence-drones-3.png",
331
332 "my-art/air-defence-drones-5.png",
333 "my-art/greed.png",
334 "my-art/continual-context.png",
335 "my-art/continual-context-part-2.png",
336 "my-art/about-face.png",
337 "my-art/perspective-of-matter.png",
338
339 -- usa-today is a sliced thread: the 18 numbered pieces (1..9, then
340 -- 99..9999999999) are just the cut-up panels of the single stitched
341 -- image usa-today.png, which is the only one we want in the gallery.
342 -- The slices stay on disk under my-art/usa-today/ (and their .txt
343 -- alt-text); only their input/ copies are stripped so they never
344 -- catalog, render, or upload. preserve_structure=true keeps the
345 -- usa-today/ subdir, so these paths carry it.
346 "my-art/usa-today/1.png",
347 "my-art/usa-today/2.png",
348 "my-art/usa-today/3.png",
349 "my-art/usa-today/4.png",
350 "my-art/usa-today/5.png",
351 "my-art/usa-today/6.png",
352 "my-art/usa-today/7.png",
353 "my-art/usa-today/8.png",
354 "my-art/usa-today/9.png",
355 "my-art/usa-today/99.png",
356 "my-art/usa-today/999.png",
357 "my-art/usa-today/9999.png",
358 "my-art/usa-today/99999.png",
359 "my-art/usa-today/999999.png",
360 "my-art/usa-today/9999999.png",
361 "my-art/usa-today/99999999.png",
362 "my-art/usa-today/999999999.png",
363 "my-art/usa-today/9999999999.png",
364
365 -- 777-1.png lives in the kooky-dookerie/ subdir, NOT the poem-pictures
366 -- root -- preserve_structure=true keeps that subdir, so the exclusion
367 -- path must carry it. The old root-level path silently matched nothing.
368 -- (The former poem-pictures/1-7.png entries were dropped after those
369 -- 2560x1440 screenshots were deleted from disk.)
370 "poem-pictures/kooky-dookerie/777-1.png",
371
372 "fediverse-stars/ffdsfa90f670235.png",
373
374 "dnd-pictures/flag.png",
375 "dnd-pictures/flag6.png",
376 "dnd-pictures/flag7.png",
377 "dnd-pictures/flag8.png",
378 "dnd-pictures/flag9.png",
379 },
380 -- }}}
381
382 -- {{{ privacy
383 -- Anonymization settings for public deployment. In "clean" mode, usernames
384 -- are replaced with sequential identifiers (user-1, user-2...) to prevent
385 -- identifying who you were talking to. The local_server_domain is your home
386 -- instance - local users are anonymized while you remain identifiable.
387 -- Available modes: "clean" (anonymize), "raw" (preserve original)
388 privacy = {
389 mode = "clean", -- "clean" or "raw"
390 anonymization_prefix = "user-", -- Prefix for anonymized usernames
391 include_boosts = false, -- Boosted/reblogged posts OFF by default (the name implies opt-in); pass --include-boosts to add them
392 preserve_original_length = true, -- Keep length hints for anonymized names
393 store_anonymization_map = false, -- Don't store mapping (privacy)
394 local_server_domain = "tech.lgbt" -- Your home instance domain
395 },
396 -- }}}
397
398 -- Golden-poem PRIORITIZATION was removed in Issue 5-015 (Dec 2025): golden
399 -- poems rank on equal footing and are distinguished only visually (the
400 -- box-drawing frame). The golden_poems bonus/quota config and its entire dead
401 -- template subsystem (golden-poem-bonus, similarity-engine, template-engine,
402 -- golden-collection-generator + tests) were deleted on 2026-06-23 -- the live
403 -- GPU ranking path never read any of it.
404
405 -- {{{ semantic_colors
406 -- Colors for the semantic clustering visualization. Each poem is assigned
407 -- a color based on its embedding cluster, creating a visual map of your
408 -- collection's thematic regions. Progress bars blend these colors.
409 -- Read by: src/semantic-color-calculator.lua
410 semantic_colors = {
411 red = { rgb = {220, 60, 60}, hex = "#dc3c3c", name = "red" },
412 blue = { rgb = {60, 120, 220}, hex = "#3c78dc", name = "blue" },
413 green = { rgb = {60, 180, 90}, hex = "#3cb45a", name = "green" },
414 purple = { rgb = {140, 60, 200}, hex = "#8c3cc8", name = "purple" },
415 orange = { rgb = {230, 140, 60}, hex = "#e68c3c", name = "orange" },
416 yellow = { rgb = {200, 180, 40}, hex = "#c8b428", name = "yellow" },
417 gray = { rgb = {120, 120, 120}, hex = "#787878", name = "gray" }
418 },
419 -- Ordered list for deterministic iteration across pages
420 color_names = {"red", "blue", "green", "purple", "orange", "yellow", "gray"},
421
422 -- {{{ color_associations
423 -- Each color's "essence" as a list of associated words -- concrete things AND
424 -- abstract feelings/concepts. semantic-color-calculator embeds every word,
425 -- mean-combines them into one per-color centroid (the same recombination used
426 -- for long-poem chunks, Issue 10-050), and assigns each poem the color whose
427 -- centroid it sits most ABOVE-baseline for (z-scored, hubness-corrected).
428 --
429 -- Why a list of associations instead of the bare color word: the bare word
430 -- "red" embeds to a generic point that, by raw nearness, swallowed ~38% of
431 -- all poems. A list (fire, blood, passion, rage...) pulls the anchor into the
432 -- color's real semantic territory -- a poem about war or embers reads red even
433 -- if it never says "red". Keep each list COHERENT (every word genuinely of
434 -- that color); a coherent set averages to a clean anchor, an incoherent one
435 -- to mush. These are a starting point -- edit freely; re-run stage 6.5 after.
436 color_associations = {
437 red = {"fire", "blood", "passion", "anger", "rose", "heat", "danger", "war", "rage", "embers", "desire", "love", "wound"},
438 blue = {"sky", "ocean", "calm", "sadness", "cold", "melancholy", "depth", "distance", "ice", "serenity", "longing", "loneliness", "peace"},
439 green = {"forest", "growth", "nature", "envy", "leaf", "spring", "life", "moss", "renewal", "jealousy", "fertility", "garden", "grass"},
440 purple = {"royalty", "mystery", "magic", "twilight", "luxury", "grief", "wisdom", "orchid", "velvet", "dusk", "nobility", "dream", "spirituality"},
441 orange = {"autumn", "warmth", "energy", "citrus", "sunset", "harvest", "enthusiasm", "pumpkin", "amber", "glow", "vitality", "spice", "zest"},
442 yellow = {"sun", "joy", "warning", "gold", "happiness", "cowardice", "daffodil", "brightness", "caution", "summer", "lemon", "optimism", "light"},
443 gray = {"fog", "ash", "stone", "age", "neutrality", "concrete", "rain", "dullness", "shadow", "winter", "steel", "silence", "gloom"},
444 },
445 -- }}}
446 -- }}}
447
448 -- {{{ similarity
449 -- Algorithm settings for computing poem-to-poem similarity scores.
450 -- Read by: src/similarity-calculator.lua
451 -- Available algorithms: "cosine", "euclidean", "manhattan", "angular", "pearson_correlation"
452 similarity = {
453 default_algorithm = "cosine" -- Cosine is standard for text embeddings
454 },
455 -- }}}
456
457 -- {{{ inference_servers
458 -- Issue 10-049: Inference-server configuration for embedding generation.
459 -- Originally written for Ollama under 10-017; renamed and re-shaped for
460 -- llama.cpp. Define multiple servers (local, remote GPU, etc.) and
461 -- switch between them via TUI selection or CLI flags.
462 -- Read by: libs/inference-server-config.lua
463 -- CLI overrides: --server NAME, --model NAME, --list-servers
464 --
465 -- Fields per server:
466 -- name: Label shown in the TUI and used with the --server flag
467 -- description: Human-readable description
468 -- host: Server hostname or IP
469 -- port: Inference server's HTTP port
470 -- model: Identifier sent in the OpenAI request body (informational;
471 -- llama-server serves whatever model it has loaded). Convention
472 -- is to use the GGUF basename without ".gguf".
473 -- model_path: Path to the GGUF model file on disk, relative to the
474 -- project DIR. start-llamacpp-server.sh resolves this
475 -- into the absolute path it passes to llama-server -m.
476 -- available_models: (optional) List of models the host can serve
477 -- embedding_prompt_prefix: (optional) Prefix prepended to every input
478 -- (e.g. "clustering: " for nomic-embed-text v1.5)
479 inference_servers = {
480 {
481 name = "gpu-server",
482 description = "Remote GPU server (CUDA)",
483 host = "192.168.0.115",
484 port = 10265,
485 model = "nomic-embed-text-v1.5",
486 model_path = "assets/models/nomic-embed-text-v1.5.Q8_0.gguf",
487 available_models = {
488 "nomic-embed-text-v1.5",
489 "mxbai-embed-large",
490 }
491 },
492 {
493 name = "gpu-server-alt",
494 description = "Remote GPU server (alternate port)",
495 host = "192.168.0.115",
496 port = 11434,
497 model = "nomic-embed-text-v1.5",
498 model_path = "assets/models/nomic-embed-text-v1.5.Q8_0.gguf",
499 },
500 {
501 name = "local",
502 description = "Local llama.cpp instance (CUDA-enabled)",
503 host = "192.168.1.100",
504 port = 10265,
505 -- nomic-embed-text v1.5 produces 768-dimensional vectors and
506 -- requires a task-prefix on every input. For diversity ranking
507 -- of poetry the right prefix is "clustering: ", which routes
508 -- the model through its clustering-oriented internal weights.
509 -- Switching models requires regenerating embeddings.json, the
510 -- similarity caches, the diversity cache, etc.
511 model = "nomic-embed-text-v1.5",
512 model_path = "assets/models/nomic-embed-text-v1.5.Q8_0.gguf",
513 embedding_prompt_prefix = "clustering: ",
514 -- This one machine can serve several local GGUFs (one at a time:
515 -- start-llamacpp-server.sh --server=local --model=NAME loads the
516 -- chosen file). The default model above is nomic; the entries below
517 -- add the others. A plain-string entry (or the default model itself)
518 -- uses the server-level model_path/prefix above; a table entry brings
519 -- its OWN GGUF and the prompt phrasing its makers intend for
520 -- clustering/similarity, so each model is asked the same question
521 -- the right way. Switching the served model needs a server restart
522 -- (and regenerating the caches that depend on the embedding space).
523 available_models = {
524 "nomic-embed-text-v1.5",
525 {
526 model = "mxbai-embed-large-v1",
527 model_path = "assets/models/mxbai-embed-large-v1.Q8_0.gguf",
528 -- No task-prompt training; embed plain text for symmetric poem-to-poem similarity
529 -- (the "Represent this sentence..." instruction is only for the query side of retrieval).
530 -- Empty string, not nil: a nil field in a table constructor is simply absent, so it could not override the server-level "clustering: " prefix if the loader falls back for missing fields. NOTE(review): confirm libs/inference-server-config.lua treats "" as "no prefix".
531 embedding_prompt_prefix = "",
532 },
533 {
534 model = "embeddinggemma-300m",
535 model_path = "assets/models/embeddinggemma-300M-Q8_0.gguf",
536 -- Trained WITH task prompts; the clustering task uses this
537 -- exact prefix per the model card, mirroring nomic's intent.
538 embedding_prompt_prefix = "task: clustering | query: ",
539 },
540 },
541 },
542 },
543 -- Default server name (must match a name above)
544 -- If not set, first server in list is used
545 default_inference_server = "local",
546 -- }}}
547
548 -- {{{ image_integration
549 -- Settings for including media attachments (images, GIFs) alongside poems.
550 -- Images from fediverse posts are copied to the output and displayed inline.
551 -- Read by: src/image-manager.lua (uses sources.images for directories)
552 image_integration = {
553 enabled = true,
554 -- NOTE: image directories now come from sources.images (Issue 10-015a)
555 supported_formats = {"png", "jpg", "jpeg", "gif", "webp", "svg"}, -- Same list as sources.images.supported_formats; keep the two in sync
556 max_file_size_mb = 100, -- Skip oversized files. NOTE(review): sources.images.max_file_size_mb is 200 -- confirm which limit is actually enforced
557 output_path = "assets/images", -- Where to copy images
558 catalog_file = "assets/image-catalog.json" -- Index of all images
559 },
560 -- }}}
561
562 -- {{{ image_sync - DEPRECATED (Issue 10-003b)
563 -- This section was replaced by external_files, which has in turn been merged
564 -- into the unified 'sources' section (Issue 10-026, see above). External file
565 -- syncing is handled by libs/external-sync.lua and scripts/sync-external-files.
566 --
567 -- To add a new image source, add a directory entry under sources.images with
568 -- an external = { source = "..." } field (not an external_files entry).
569 -- }}}
570 -- REMOVED: image_sync section (10-003b)
571
572 -- {{{ pagination
573 -- Controls how poems are split across HTML pages. Large collections need
574 -- pagination to keep page load times reasonable.
575 -- Read by: src/flat-html-generator.lua:load_pagination_config()
576 -- CLI overrides: --poems-per-page, --chrono-per-page, --pages (via run.sh)
577 pagination = {
578 poems_per_page = 200, -- Poems per similar/different page
579 -- CLI: --poems-per-page N (run.sh default: 200)
580 minimum_pages = 1, -- Minimum pages to generate
581 -- max_pages_per_poem is intentionally NOT here: the per-poem page ceiling is
582 -- COMPUTED each build from the storage quota (storage.limit_gb below) and the
583 -- measured size of the last build's pages, by flat-html-generator's
584 -- compute_storage_max_pages (Issue 10-057). A frozen 15 was an estimate that
585 -- would have shipped ~66GB into a 45GB quota.
586 page_number_padding = 2, -- Zero-padding for page numbers (01, 02...)
587 generate_txt_exports = true, -- Generate .txt versions of poems
588 generate_html_archives = false, -- Disabled: redundant with paginated pages
589 chronological_paginated = false, -- Split chronological.html into pages
590 chronological_poems_per_page = 1000 -- Poems per chronological page (if paginated)
591 -- CLI: --chrono-per-page N
592 },
593 -- }}}
594
595 -- {{{ storage
596 -- Budget planning for Neocities deployment. These values inform the
597 -- pagination system about storage constraints.
598 -- Read by: src/flat-html-generator.lua:load_pagination_config()
599 storage = {
600 limit_gb = 45, -- Total available storage (Neocities supporter)
601 reserved_for_maze_gb = 0.031, -- Reserved for HTML Maze feature
602 reserved_headroom_gb = 5 -- Safety buffer
603 },
604 -- }}}
605
606 -- {{{ word_cloud
607 -- Word cloud page settings. Extracts vocabulary from all poems, filters
608 -- stop words (common words like "the", "and"), and displays the remaining
609 -- words sized by frequency. Each word links to poems containing it.
610 -- Read by: src/wordcloud-generator.lua
611 word_cloud = {
612 enabled = true,
613 output_file = "wordcloud.html",
614 min_occurrences = 5, -- Minimum times a word must appear
615 max_words = 200, -- Maximum words to display (0 = unlimited)
616 min_word_length = 3, -- Ignore words shorter than this
617 font_size_min = 1, -- HTML font tag: 1-7 scale
618 font_size_max = 7,
619
620 -- Stop words: common words to exclude from word cloud
621 -- Organized by category for easy editing
622 stop_words = {
623 -- Anonymization artifacts (from privacy processing)
624 "user", "users",
625 -- Contraction fragments (from apostrophe removal)
626 "don", "doesn", "didn", "isn", "aren", "wasn", "weren",
627 "wouldn", "couldn", "shouldn", "haven", "hasn", "hadn", "won",
628 -- URL/Technical artifacts
629 "https", "http", "www", "com", "org", "net",
630 -- Articles
631 "a", "an", "the",
632 -- Pronouns
633 "i", "me", "my", "mine", "myself", "you", "your", "yours", "yourself",
634 "he", "him", "his", "himself", "she", "her", "hers", "herself",
635 "it", "its", "itself", "we", "us", "our", "ours", "ourselves",
636 "they", "them", "their", "theirs", "themselves",
637 "who", "whom", "whose", "which", "what", "that", "this", "these", "those",
638 -- Prepositions
639 "in", "on", "at", "to", "for", "of", "with", "by", "from", "up", "down",
640 "out", "into", "over", "under", "through", "between", "among",
641 "about", "after", "before", "during", "without", "within",
642 -- Conjunctions
643 "and", "or", "but", "nor", "so", "yet", "because", "although",
644 "while", "if", "when", "where", "as", "than",
645 -- Auxiliary verbs
646 "is", "are", "was", "were", "be", "been", "being", "am",
647 "have", "has", "had", "having", "do", "does", "did", "doing",
648 "will", "would", "could", "should", "may", "might", "must", "shall", "can",
649 -- Common verbs
650 "get", "got", "go", "went", "gone", "come", "came", "make", "made",
651 "take", "took", "taken", "see", "saw", "seen", "know", "knew", "known",
652 "think", "thought", "say", "said", "give", "gave", "given",
653 "find", "found", "tell", "told", "feel", "felt", "become", "became",
654 "leave", "left", "put", "keep", "kept", "let", "begin", "began", "begun",
655 "seem", "seemed", "help", "helped", "show", "showed", "shown",
656 "hear", "heard", "turn", "turned", "start", "started", "run", "ran", "move", "moved",
657 -- Common adverbs
658 "very", "really", "just", "also", "too", "still", "even", "now", "then",
659 "here", "there", "always", "never", "often", "sometimes", "already",
660 "again", "ever", "soon", "only",
661 -- Question words
662 "how", "why",
663 -- Other common words
664 "all", "some", "any", "no", "not", "more", "most", "other", "such",
665 "own", "same", "like", "well", "way", "back", "much", "many",
666 "new", "good", "first", "last", "long", "great", "little", "old",
667 "right", "big", "high", "different", "small", "large", "next", "early",
668 "young", "important", "few", "public", "bad", "enough", "able", "sure",
669 "thing", "things", "people", "time", "year", "years", "day", "days",
670 "world", "life", "man", "woman", "men", "women", "child", "children",
671 "something", "nothing", "everything", "someone", "anyone", "everyone"
672 }
673 },
674 -- }}}
675
676 -- {{{ centroids
677 -- Mood-based exploration anchors. Each centroid defines a "semantic target"
678 -- using keywords and optional source files. The pipeline embeds these targets
679 -- and generates similarity pages showing which poems match each mood.
680 -- Read by: src/centroid-generator.lua
681 --
682 -- To add a new mood: copy an existing entry, change the name/slug/keywords.
683 -- Keywords can be single words or evocative phrases - the embedding model
684 -- will find poems that feel similar to the combined meaning.
685 centroids = {
686 {
687 name = "melancholy",
688 description = "Sad, reflective, introspective moods - winter feelings and quiet grief",
689 source_files = {},
690 keywords = {
691 "loneliness",
692 "grief",
693 "winter",
694 "rain on windows",
695 "empty rooms",
696 "quiet sadness",
697 "memory of someone gone",
698 "the weight of silence"
699 },
700 output_slug = "melancholy"
701 },
702 {
703 name = "wonder",
704 description = "Awe, curiosity, the vastness of existence",
705 source_files = {},
706 keywords = {
707 "stars",
708 "infinity",
709 "childhood wonder",
710 "discovery",
711 "the unknown",
712 "first time seeing the ocean",
713 "questions without answers",
714 "the size of the universe"
715 },
716 output_slug = "wonder"
717 },
718 {
719 name = "rage",
720 description = "Anger, frustration, righteous fury",
721 source_files = {},
722 keywords = {
723 "injustice",
724 "betrayal",
725 "fire",
726 "screaming into the void",
727 "broken promises",
728 "systemic failure",
729 "enough is enough"
730 },
731 output_slug = "rage"
732 },
733 {
734 name = "tenderness",
735 description = "Gentle love, care, softness between beings",
736 source_files = {},
737 keywords = {
738 "holding hands",
739 "soft voice",
740 "caring for someone sick",
741 "pet sleeping on your lap",
742 "forgiveness",
743 "vulnerability",
744 "being seen"
745 },
746 output_slug = "tenderness"
747 },
748 {
749 name = "absurdity",
750 description = "The strange, surreal, and darkly comic",
751 source_files = {},
752 keywords = {
753 "kafka",
754 "bureaucracy",
755 "meaninglessness that becomes funny",
756 "the universe as joke",
757 "recursive paradox",
758 "waiting for something that never comes"
759 },
760 output_slug = "absurd"
761 },
762 {
763 name = "hope",
764 description = "Uplifting, encouraging, healing - poems for hope cards and difficult times",
765 source_files = {},
766 keywords = {
767 "hope",
768 "healing",
769 "light at the end of the tunnel",
770 "things will get better",
771 "resilience after hardship",
772 "growth through difficulty",
773 "recovery and renewal",
774 "new beginnings",
775 "gentle encouragement",
776 "you are not alone in this",
777 "kindness in dark times",
778 "compassion for yourself",
779 "tomorrow is another day",
780 "this too shall pass",
781 "the relief after crying",
782 "being held when you're scared",
783 "winter turning to spring",
784 "stars in the darkest night",
785 "tired but still here",
786 "scared but brave enough",
787 "small victories matter",
788 "rest is not giving up",
789 "you did your best today",
790 "permission to be imperfect"
791 },
792 output_slug = "hope"
793 },
794 {
795 name = "fierce-hope",
796 description = "Empowering, activist, revolutionary hope - strength and resistance",
797 source_files = {},
798 keywords = {
799 "revolution",
800 "resistance",
801 "we will overcome",
802 "rising up together",
803 "collective power",
804 "speaking truth to power",
805 "no justice no peace",
806 "solidarity",
807 "the arc of justice",
808 "they tried to bury us they didn't know we were seeds",
809 "we are the ones we've been waiting for",
810 "never give up never surrender",
811 "fierce tenderness",
812 "angry and hopeful",
813 "burn it down and build anew"
814 },
815 output_slug = "fierce-hope"
816 },
817 {
818 name = "quiet-comfort",
819 description = "Cozy, gentle, safe spaces - poems for rest and sanctuary",
820 source_files = {},
821 keywords = {
822 "rest",
823 "safety",
824 "warm blanket on cold night",
825 "tea and quiet moments",
826 "sanctuary from the storm",
827 "soft lighting",
828 "gentle rain on windows",
829 "curled up with a book",
830 "permission to do nothing",
831 "the luxury of being alone",
832 "home as refuge",
833 "peace in small things",
834 "the comfort of routine",
835 "slow mornings",
836 "everything can wait",
837 "you are safe here"
838 },
839 output_slug = "comfort"
840 }
841 },
842 -- }}}
843
844 -- {{{ html_theme
845 -- Dark mode theme colors applied via HTML body attributes (CSS-free).
846 -- Uses true black (#000000) for OLED power savings and maximum contrast.
847 -- These colors are applied to <body bgcolor="..." text="..." link="..." vlink="...">
848 html_theme = {
849 background = "#000000", -- True black background (OLED-friendly)
850 text = "#FFFFFF", -- White text for readability
851 link = "#6699FF", -- Blue for unvisited links
852 vlink = "#9966FF" -- Purple for visited links
853 },
854 -- }}}
855
856 -- {{{ Algorithm Reference (documentation only)
857 -- These algorithm descriptions are for reference only - not read by scripts.
858 -- The actual algorithm is selected via similarity.default_algorithm above.
859 --
860 -- Available algorithms:
861 -- cosine: Angle between vectors, range [-1, 1], fast, best for text embeddings
862 -- euclidean: Distance converted to similarity, range [0, 1], fast
863 -- manhattan: L1 distance converted to similarity, range [0, 1], robust to outliers
864 -- angular: Normalized angle, range [0, 1], good for directional data
865 -- pearson_correlation: Correlation coefficient, listed range [0, 1] (raw Pearson is [-1, 1]; presumably rescaled -- TODO confirm), for statistical analysis
866 --
867 -- Removed stale options (2026-01-21, Issue 10-003):
868 -- output_format: Only JSON is supported, no need for config
869 -- preserve_timestamps: Always preserved, not configurable
870 -- validation_settings: Over-engineering, not implemented
871 -- }}}
872}
873