#!/usr/bin/env python3
"""Embed hypher-generated `.bin` tries into constexpr headers."""

# Script path in the repository: scripts/generate_hyphenation_trie.py

from __future__ import annotations

import argparse
import pathlib


def _format_bytes(blob: bytes, per_line: int = 16) -> str:
    """Render *blob* as comma-separated hex literals, wrapped consistently.

    Args:
        blob: Raw bytes to render.
        per_line: Number of byte literals emitted per output line.

    Returns:
        The lines of an array initializer body (without the surrounding
        braces), each terminated by a trailing comma and joined with newlines.

    Raises:
        ValueError: If *per_line* is not a positive integer.
    """
    if per_line < 1:
        # A negative step would produce an empty range and silently emit the
        # placeholder byte even for a non-empty blob.
        raise ValueError('per_line must be a positive integer')
    lines = []
    for start in range(0, len(blob), per_line):
        chunk = ', '.join(f"0x{b:02X}" for b in blob[start : start + per_line])
        lines.append(f"    {chunk},")
    if not lines:
        # An empty blob still yields one placeholder byte; presumably so the
        # generated array initializer is never empty.
        lines.append("    0x00,")
    return '\n'.join(lines)


def _symbol_from_output(path: pathlib.Path) -> str:
    """Derive a stable C identifier from the destination header name.

    For example, ``hyph-en.trie.h`` becomes ``en``. Only the file name is
    considered; the directory part of *path* is ignored.

    Args:
        path: Destination header path.

    Returns:
        The file name with a trailing ``.trie.h``, a leading ``hyph-`` and a
        trailing ``.trie`` removed, and every ``-`` replaced by ``_``.
    """
    name = path.name
    if name.endswith('.trie.h'):
        name = name[: -len('.trie.h')]
    if name.startswith('hyph-'):
        name = name[len('hyph-'):]
    name = name.replace('-', '_')
    if name.endswith('.trie'):
        name = name[: -len('.trie')]
    return name


def write_header(path: pathlib.Path, blob: bytes, symbol: str) -> None:
    """Emit a constexpr header holding *blob* plus its pattern descriptor.

    The header defines ``<symbol>_trie_data`` (the raw bytes) and
    ``<symbol>_patterns`` (a ``SerializedHyphenationPatterns`` initialised
    with the data pointer and its size).

    Args:
        path: Destination header path; missing parent directories are created.
        blob: Serialized trie bytes to embed.
        symbol: Identifier prefix used for the generated C++ symbols.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    data_symbol = f"{symbol}_trie_data"
    patterns_symbol = f"{symbol}_patterns"
    bytes_literal = _format_bytes(blob)
    # <cstdint> provides uint8_t and <cstddef> provides size_t; a bare
    # `#include` with no header name would make the generated file ill-formed.
    content = f"""#pragma once

#include <cstddef>
#include <cstdint>

#include "../SerializedHyphenationTrie.h"

// Auto-generated by generate_hyphenation_trie.py. Do not edit manually.
alignas(4) constexpr uint8_t {data_symbol}[] = {{
{bytes_literal}
}};

constexpr SerializedHyphenationPatterns {patterns_symbol} = {{
    {data_symbol},
    sizeof({data_symbol}),
}};
"""
    # Pin the encoding so the generated header does not depend on the locale
    # of the machine running the script.
    path.write_text(content, encoding='utf-8')


def main() -> None:
    """Parse ``--input``/``--output`` pairs and generate one header per pair.

    Both options may be repeated; the N-th ``--input`` is written to the N-th
    ``--output``.

    Raises:
        SystemExit: If the number of inputs and outputs differ.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='inputs', action='append', required=True,
                        help='Path to a hypher-generated .bin trie')
    parser.add_argument('--output', dest='outputs', action='append', required=True,
                        help='Destination header path (hyph-*.trie.h)')
    args = parser.parse_args()

    if len(args.inputs) != len(args.outputs):
        raise SystemExit('input/output counts must match')

    for src, dst in zip(args.inputs, args.outputs):
        # Process each input/output pair independently so mixed-language
        # refreshes work in one invocation.
        src_path = pathlib.Path(src)
        blob = src_path.read_bytes()
        out_path = pathlib.Path(dst)
        symbol = _symbol_from_output(out_path)
        write_header(out_path, blob, symbol)
        print(f'wrote {dst} ({len(blob)} bytes payload)')


if __name__ == '__main__':
    main()