mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-04 22:57:50 +03:00
## Summary

* Adds (optional) hyphenation for English, French, German, and Russian.

## Additional Context

* The included hyphenation dictionaries add approximately 280kb to the flash usage (German alone takes 200kb).
* Trie-encoded dictionaries are adopted from the hypher project (https://github.com/typst/hypher).
* Soft hyphens (and other explicit hyphens) take precedence over dictionary-based hyphenation.

Overall, the hyphenation rules are quite aggressive, as I believe that makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
83 lines
2.6 KiB
Python
Executable File
83 lines
2.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Embed hypher-generated `.bin` tries into constexpr headers."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import pathlib
|
|
|
|
|
|
def _format_bytes(blob: bytes, per_line: int = 16) -> str:
    """Render ``blob`` as wrapped rows of comma-separated hex literals.

    Every row holds at most ``per_line`` bytes written as ``0xNN`` (upper-case
    hex digits), starts with a single space and ends with a trailing comma.
    Rows are joined with newlines. An empty ``blob`` yields the single row
    ``0x00`` so the returned text is never empty.
    """
    rows = [
        " " + ", ".join("0x{:02X}".format(value) for value in blob[start:start + per_line]) + ","
        for start in range(0, len(blob), per_line)
    ]
    # Nothing to render: fall back to a lone zero byte so the brace-enclosed
    # initializer that write_header builds around this text is not empty.
    return "\n".join(rows or [" 0x00,"])
|
|
|
|
|
|
def _symbol_from_output(path: pathlib.Path) -> str:
    """Derive a stable C identifier stem from the destination header name.

    The ``.trie.h`` (or bare ``.trie``) suffix and the ``hyph-`` prefix are
    dropped and dashes become underscores, e.g. ``hyph-en.trie.h`` -> ``en``
    and ``hyph-de-1996.trie.h`` -> ``de_1996``. Any other character that is
    not an ASCII letter, digit or underscore is also mapped to ``_``, so an
    unconventional name such as ``hyph-en.h`` yields ``en_h`` rather than
    ``en.h``, which would not be usable inside a C identifier.

    Args:
        path: Destination header path; only its final component is used.

    Returns:
        The stem that write_header prefixes onto the generated symbol names.
        An empty stem or one starting with a digit is returned unchanged.
    """
    name = path.name
    if name.endswith('.trie.h'):
        name = name[:-len('.trie.h')]
    if name.startswith('hyph-'):
        name = name[len('hyph-'):]
    # Strip a leftover '.trie' before sanitising, otherwise its dot would be
    # turned into an underscore and the suffix would survive as '_trie'.
    if name.endswith('.trie'):
        name = name[:-len('.trie')]
    # Covers the '-' -> '_' mapping and every other non-identifier character.
    return ''.join(
        ch if (ch.isascii() and ch.isalnum()) or ch == '_' else '_'
        for ch in name
    )
|
|
|
|
|
|
def write_header(path: pathlib.Path, blob: bytes, symbol: str) -> None:
    """Write a constexpr C++ header that embeds ``blob``.

    The header defines two symbols derived from ``symbol``:

    * ``<symbol>_trie_data`` - an ``alignas(4)`` ``uint8_t`` array holding
      the raw bytes, rendered by ``_format_bytes``.
    * ``<symbol>_patterns`` - a ``SerializedHyphenationPatterns`` initialised
      with that array and its ``sizeof``.

    Args:
        path: Destination header file. Missing parent directories are
            created; an existing file is overwritten.
        blob: Raw trie bytes to embed.
        symbol: Prefix used for both generated symbol names.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    data_symbol = f"{symbol}_trie_data"
    patterns_symbol = f"{symbol}_patterns"
    bytes_literal = _format_bytes(blob)
    content = f"""#pragma once

#include <cstddef>
#include <cstdint>

#include "../SerializedHyphenationTrie.h"

// Auto-generated by generate_hyphenation_trie.py. Do not edit manually.
alignas(4) constexpr uint8_t {data_symbol}[] = {{
{bytes_literal}
}};

constexpr SerializedHyphenationPatterns {patterns_symbol} = {{
    {data_symbol},
    sizeof({data_symbol}),
}};
"""
    # Pin the encoding: without it write_text falls back to the locale
    # encoding, making the generated source file depend on the host machine.
    path.write_text(content, encoding="utf-8")
|
|
|
|
|
|
def main() -> None:
    """Convert each ``--input`` trie into the header named by ``--output``.

    Both options are required and repeatable; the n-th input is paired with
    the n-th output. Exits through ``SystemExit`` when the two counts differ.
    A one-line summary is printed for every header written.
    """
    cli = argparse.ArgumentParser()
    option_table = (
        ('--input', 'inputs', 'Path to a hypher-generated .bin trie'),
        ('--output', 'outputs', 'Destination header path (hyph-*.trie.h)'),
    )
    for flag, dest, help_text in option_table:
        cli.add_argument(flag, dest=dest, action='append', required=True, help=help_text)
    opts = cli.parse_args()

    sources, targets = opts.inputs, opts.outputs
    if len(sources) != len(targets):
        raise SystemExit('input/output counts must match')

    # Each pair is handled on its own, so several languages can be refreshed
    # in a single invocation.
    for source, target in zip(sources, targets):
        payload = pathlib.Path(source).read_bytes()
        header_path = pathlib.Path(target)
        write_header(header_path, payload, _symbol_from_output(header_path))
        print(f'wrote {target} ({len(payload)} bytes payload)')
|
|
|
|
|
|
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|