Compare commits

..

5 Commits

Author SHA1 Message Date
Arthur Tazhitdinov
3f9d84dafb clang format fix 2026-01-14 01:12:00 +05:00
Arthur Tazhitdinov
ee2db07e64 hyphenation evaluation 2026-01-14 01:10:18 +05:00
Arthur Tazhitdinov
3de16d679a docs: Add documentation for Hypher binary tries and firmware embedding process 2026-01-14 01:08:21 +05:00
Arthur Tazhitdinov
0e29640699 fix: Update BOOK_CACHE_VERSION to 5 2026-01-14 01:07:18 +05:00
Arthur Tazhitdinov
43ebe9c66e fix: Enhance Latin letter case conversion and improve isLatinLetter function 2026-01-13 23:51:17 +05:00
10 changed files with 46151 additions and 4 deletions

View File

@ -0,0 +1,66 @@
# Hypher Binary Tries
CrossPoint embeds the exact binary automata produced by
[Typst's `hypher`](https://github.com/typst/hypher).
## File layout
Each `.bin` blob is a single self-contained automaton:
```
uint32_t root_addr_be; // big-endian offset of the root node
uint8_t levels[]; // shared "levels" tape (dist/score pairs)
uint8_t nodes[]; // node records packed back-to-back
```
The size of the `levels` tape is implicit. Individual nodes reference slices
inside that tape via 12-bit offsets, so no additional pointers are required.
### Node encoding
Every node starts with a single control byte:
- Bit 7 set when the node stores scores (`levels`).
- Bits 5-6 stride of the target deltas (1, 2, or 3 bytes, big-endian).
- Bits 0-4 transition count (values ≥ 31 spill into an extra byte).
If the `levels` flag is set, two more bytes follow. Together they encode a
12-bit offset into the global `levels` tape and a 4-bit length. Each byte in the
levels tape packs a distance/score pair as `dist * 10 + score`, where `dist`
counts how many UTF-8 bytes we advanced since the previous digit.
After the optional levels header come the transition labels (one byte per edge)
followed by the signed target deltas. Targets are stored as relative offsets
from the current node address. Deltas up to about ±128 fit in a single byte;
larger distances grow to 2 or 3 bytes. The runtime walks the transitions with a
simple linear scan and materializes the absolute address by adding the decoded
delta to the current node's base address.
## Embedding blobs into the firmware
The helper script `scripts/generate_hyphenation_trie.py` acts as a thin
wrapper: it reads the hypher-generated `.bin` files, formats them as `constexpr`
byte arrays, and emits headers under
`lib/Epub/Epub/hyphenation/generated/`. Each header defines the raw data plus a
`SerializedHyphenationPatterns` descriptor so the reader can keep the automaton
in flash.
To refresh the firmware assets after updating the `.bin` files, run:
```
./scripts/generate_hyphenation_trie.py \
--input lib/Epub/Epub/hyphenation/tries/en.bin \
--output lib/Epub/Epub/hyphenation/generated/hyph-en.trie.h
./scripts/generate_hyphenation_trie.py \
--input lib/Epub/Epub/hyphenation/tries/fr.bin \
--output lib/Epub/Epub/hyphenation/generated/hyph-fr.trie.h
./scripts/generate_hyphenation_trie.py \
--input lib/Epub/Epub/hyphenation/tries/de.bin \
--output lib/Epub/Epub/hyphenation/generated/hyph-de.trie.h
./scripts/generate_hyphenation_trie.py \
--input lib/Epub/Epub/hyphenation/tries/ru.bin \
--output lib/Epub/Epub/hyphenation/generated/hyph-ru.trie.h
```

View File

@ -9,7 +9,7 @@
#include "FsHelpers.h"
namespace {
// Version tag for the book cache; the current value is 5.
// NOTE(review): presumably bumped from 4 so that caches written by older
// builds are rebuilt — confirm against the code that reads this constant.
constexpr uint8_t BOOK_CACHE_VERSION = 5;
constexpr char bookBinFile[] = "/book.bin";
constexpr char tmpSpineBinFile[] = "/spine.bin.tmp";
constexpr char tmpTocBinFile[] = "/toc.bin.tmp";

View File

@ -4,12 +4,25 @@
namespace {
// Convert Latin uppercase letters (ASCII plus Latin-1 supplement) to lowercase.
// Three additional uppercase letters outside those ranges (Œ, Ÿ, ẞ) are mapped
// explicitly. Every other code point is returned unchanged.
uint32_t toLowerLatinImpl(const uint32_t cp) {
  // ASCII A-Z.
  if (cp >= 'A' && cp <= 'Z') {
    return cp - 'A' + 'a';
  }
  // Latin-1 supplement À-Ö and Ø-Þ. U+00D7 (×) lies between the two ranges
  // and is deliberately left out; the lowercase forms sit 0x20 above.
  if ((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00DE)) {
    return cp + 0x20;
  }
  switch (cp) {
    case 0x0152:      // Œ
      return 0x0153;  // œ
    case 0x0178:      // Ÿ
      return 0x00FF;  // ÿ
    case 0x1E9E:      // ẞ
      return 0x00DF;  // ß
    default:
      return cp;
  }
}
// Convert Cyrillic uppercase letters to lowercase
@ -31,7 +44,26 @@ uint32_t toLowerLatin(const uint32_t cp) { return toLowerLatinImpl(cp); }
// Thin wrapper: forwards the code point to toLowerCyrillicImpl and returns
// whatever that helper produces.
uint32_t toLowerCyrillic(const uint32_t cp) {
  const uint32_t lowered = toLowerCyrillicImpl(cp);
  return lowered;
}
// Returns true when cp is a Latin letter recognised by the hyphenation code:
//  - ASCII A-Z / a-z,
//  - Latin-1 supplement letters U+00C0..U+00FF, except U+00D7 (×) and
//    U+00F7 (÷), which fall in the gaps between the ranges below,
//  - the individually listed letters Œ, œ, Ÿ and ẞ.
bool isLatinLetter(const uint32_t cp) {
  if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) {
    return true;
  }
  // The three ranges already skip U+00D7 and U+00F7, so no extra exclusion
  // test is needed.
  if ((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00F6) || (cp >= 0x00F8 && cp <= 0x00FF)) {
    return true;
  }
  switch (cp) {
    case 0x0152:  // Œ
    case 0x0153:  // œ
    case 0x0178:  // Ÿ
    case 0x1E9E:  // ẞ
      return true;
    default:
      return false;
  }
}
// True for code points in the inclusive range U+0400..U+052F.
bool isCyrillicLetter(const uint32_t cp) {
  constexpr uint32_t kRangeFirst = 0x0400;
  constexpr uint32_t kRangeLast = 0x052F;
  if (cp < kRangeFirst) {
    return false;
  }
  return cp <= kRangeLast;
}

View File

@ -0,0 +1,389 @@
#include <Utf8.h>
#include <algorithm>
#include <cctype>
#include <cmath>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "lib/Epub/Epub/hyphenation/HyphenationCommon.h"
#include "lib/Epub/Epub/hyphenation/LanguageHyphenator.h"
#include "lib/Epub/Epub/hyphenation/LanguageRegistry.h"
// One line of the hyphenation test data.
struct TestCase {
  std::string word;                       // Plain word handed to the hyphenator.
  std::string hyphenated;                 // Reference form with '=' at each expected break.
  std::vector<size_t> expectedPositions;  // Code point indexes of the expected breaks.
  int frequency = 0;                      // Frequency column of the data file; zero until parsed.
};
// Per-word comparison between predicted and expected break positions.
// All counters and scores start at zero.
struct EvaluationResult {
  // Raw counts.
  int truePositives{0};   // Predicted breaks that are also expected.
  int falsePositives{0};  // Predicted breaks that are not expected.
  int falseNegatives{0};  // Expected breaks that were not predicted.

  // Derived scores.
  double precision{0.0};
  double recall{0.0};
  double f1Score{0.0};
  double weightedScore{0.0};
};
// Describes one language the evaluator can be run against.
struct LanguageConfig {
  std::string cliName;       // Name accepted on the command line.
  std::string testDataFile;  // Path of the test data file for this language.
  const char* primaryTag;    // Tag used to look up the hyphenator.
};

// Every language known to the evaluator, in the order they are reported.
const std::vector<LanguageConfig> kSupportedLanguages = {
    LanguageConfig{"english", "test/hyphenation_eval/resources/english_hyphenation_tests.txt", "en"},
    LanguageConfig{"french", "test/hyphenation_eval/resources/french_hyphenation_tests.txt", "fr"},
    LanguageConfig{"german", "test/hyphenation_eval/resources/german_hyphenation_tests.txt", "de"},
    LanguageConfig{"russian", "test/hyphenation_eval/resources/russian_hyphenation_tests.txt", "ru"},
};
// Turns an annotated word such as "Sil=ben" into the list of break positions.
// Each '=' records the number of code points consumed so far; the '=' itself
// is not counted. Scanning stops at the first NUL byte.
std::vector<size_t> expectedPositionsFromAnnotatedWord(const std::string& annotated) {
  std::vector<size_t> breaks;
  size_t consumedCodepoints = 0;
  const auto* cursor = reinterpret_cast<const unsigned char*>(annotated.c_str());
  while (*cursor != 0) {
    if (*cursor != '=') {
      // Presumably advances the cursor past one UTF-8 encoded code point.
      utf8NextCodepoint(&cursor);
      ++consumedCodepoints;
    } else {
      breaks.push_back(consumedCodepoints);
      ++cursor;
    }
  }
  return breaks;
}
// Reads test cases from `filename`.
//
// Each data line has the form `word|hyphenated|frequency`. Empty lines and
// lines starting with '#' are ignored, as are lines with fewer than three
// fields. A line whose frequency field is not a valid integer is reported on
// stderr and skipped instead of aborting the whole run with an exception.
//
// Returns the parsed cases; the result is empty when the file cannot be
// opened (an error is printed in that case).
std::vector<TestCase> loadTestData(const std::string& filename) {
  std::vector<TestCase> testCases;
  std::ifstream file(filename);
  if (!file.is_open()) {
    std::cerr << "Error: Could not open file " << filename << std::endl;
    return testCases;
  }

  std::string line;
  size_t lineNumber = 0;
  while (std::getline(file, line)) {
    ++lineNumber;
    if (line.empty() || line[0] == '#') {
      continue;
    }

    std::istringstream fields(line);
    std::string word, hyphenated, freqStr;
    if (!(std::getline(fields, word, '|') && std::getline(fields, hyphenated, '|') &&
          std::getline(fields, freqStr, '|'))) {
      continue;
    }

    // Stream extraction reports failure through the stream state, so bad or
    // out-of-range numbers cannot throw the way std::stoi does.
    int frequency = 0;
    std::istringstream freqStream(freqStr);
    if (!(freqStream >> frequency)) {
      std::cerr << "Warning: " << filename << ":" << lineNumber << ": invalid frequency '" << freqStr
                << "', skipping line" << std::endl;
      continue;
    }

    TestCase testCase;
    testCase.word = word;
    testCase.hyphenated = hyphenated;
    testCase.frequency = frequency;
    testCase.expectedPositions = expectedPositionsFromAnnotatedWord(hyphenated);
    testCases.push_back(testCase);
  }
  // The ifstream destructor closes the file.
  return testCases;
}
// Rebuilds an annotated word by inserting '=' before the code point at each
// index listed in `positions` (an index equal to the code point count puts the
// mark at the end). Positions are processed in ascending order; duplicates
// yield repeated marks.
std::string positionsToHyphenated(const std::string& word, const std::vector<size_t>& positions) {
  std::vector<size_t> ordered(positions);
  std::sort(ordered.begin(), ordered.end());

  std::string annotated;
  size_t pending = 0;  // Index of the next unconsumed entry in `ordered`.

  // Appends one '=' for every pending position equal to `index`.
  const auto emitBreaksAt = [&](const size_t index) {
    for (; pending < ordered.size() && ordered[pending] == index; ++pending) {
      annotated.push_back('=');
    }
  };

  size_t index = 0;
  const auto* cursor = reinterpret_cast<const unsigned char*>(word.c_str());
  while (*cursor != 0) {
    emitBreaksAt(index);
    const unsigned char* const start = cursor;
    utf8NextCodepoint(&cursor);
    // Copy the bytes that utf8NextCodepoint just moved the cursor across.
    annotated.append(reinterpret_cast<const char*>(start), reinterpret_cast<const char*>(cursor));
    ++index;
  }
  emitBreaksAt(index);
  return annotated;
}
// Runs `word` through the preprocessing helpers (code point collection,
// surrounding punctuation trim, trailing footnote reference trim) and then
// asks `hyphenator` for the break indexes.
std::vector<size_t> hyphenateWordWithHyphenator(const std::string& word, const LanguageHyphenator& hyphenator) {
  auto codepoints = collectCodepoints(word);

  // Both helpers are handed the same sequence; order matches the original call order.
  trimSurroundingPunctuation(codepoints);
  trimTrailingFootnoteReference(codepoints);

  std::vector<size_t> breaks = hyphenator.breakIndexes(codepoints);
  return breaks;
}
std::vector<LanguageConfig> resolveLanguages(const std::string& selection) {
if (selection == "all") {
return kSupportedLanguages;
}
for (const auto& config : kSupportedLanguages) {
if (config.cliName == selection) {
return {config};
}
}
return {};
}
// Scores the hyphenator's output for one test case.
//
// `hyphenateFunc` is called once with `testCase.word`; its positions are
// compared with `testCase.expectedPositions`:
//  - truePositives:  predicted positions that are expected,
//  - falsePositives: predicted positions that are not expected,
//  - falseNegatives: expected positions that were not predicted.
// Precision, recall and F1 follow from those counts. A word with no expected
// and no predicted breaks counts as a perfect match. The weighted score
// charges a false positive twice as much as a false negative and is clamped
// at zero.
EvaluationResult evaluateWord(const TestCase& testCase,
                              std::function<std::vector<size_t>(const std::string&)> hyphenateFunc) {
  EvaluationResult result;

  std::vector<size_t> expected = testCase.expectedPositions;
  std::vector<size_t> actual = hyphenateFunc(testCase.word);
  std::sort(expected.begin(), expected.end());
  std::sort(actual.begin(), actual.end());

  // Both vectors are sorted, so membership is a binary search.
  for (const size_t pos : actual) {
    if (std::binary_search(expected.begin(), expected.end(), pos)) {
      result.truePositives++;
    } else {
      result.falsePositives++;
    }
  }
  for (const size_t pos : expected) {
    if (!std::binary_search(actual.begin(), actual.end(), pos)) {
      result.falseNegatives++;
    }
  }

  if (result.truePositives + result.falsePositives > 0) {
    result.precision = static_cast<double>(result.truePositives) / (result.truePositives + result.falsePositives);
  }
  if (result.truePositives + result.falseNegatives > 0) {
    result.recall = static_cast<double>(result.truePositives) / (result.truePositives + result.falseNegatives);
  }
  if (result.precision + result.recall > 0) {
    result.f1Score = 2 * result.precision * result.recall / (result.precision + result.recall);
  }

  // Treat words that contain no hyphenation marks in both the expected data and the
  // algorithmic output as perfect matches so they don't drag down the per-word averages.
  if (expected.empty() && actual.empty()) {
    result.precision = 1.0;
    result.recall = 1.0;
    result.f1Score = 1.0;
  }

  // Keep the penalty arithmetic in double: storing the weighted sums in an int
  // would silently truncate them if the weights ever become fractional.
  constexpr double fpPenalty = 2.0;
  constexpr double fnPenalty = 1.0;
  const double weightedErrors = result.falsePositives * fpPenalty + result.falseNegatives * fnPenalty;
  const double maxPenalty = static_cast<double>(expected.size()) * fpPenalty;
  if (maxPenalty > 0.0) {
    result.weightedScore = std::max(0.0, 1.0 - weightedErrors / maxPenalty);
  } else if (result.falsePositives == 0) {
    // Nothing expected and nothing wrongly predicted.
    result.weightedScore = 1.0;
  }
  return result;
}
void printResults(const std::string& language, const std::vector<TestCase>& testCases,
const std::vector<std::pair<TestCase, EvaluationResult>>& worstCases, int perfectMatches,
int partialMatches, int completeMisses, double totalPrecision, double totalRecall, double totalF1,
double totalWeighted, int totalTP, int totalFP, int totalFN,
std::function<std::vector<size_t>(const std::string&)> hyphenateFunc) {
std::string lang_upper = language;
if (!lang_upper.empty()) {
lang_upper[0] = std::toupper(lang_upper[0]);
}
std::cout << "================================================================================" << std::endl;
std::cout << lang_upper << " HYPHENATION EVALUATION RESULTS" << std::endl;
std::cout << "================================================================================" << std::endl;
std::cout << std::endl;
std::cout << "Total test cases: " << testCases.size() << std::endl;
std::cout << "Perfect matches: " << perfectMatches << " (" << (perfectMatches * 100.0 / testCases.size()) << "%)"
<< std::endl;
std::cout << "Partial matches: " << partialMatches << std::endl;
std::cout << "Complete misses: " << completeMisses << std::endl;
std::cout << std::endl;
std::cout << "--- Overall Metrics (averaged per word) ---" << std::endl;
std::cout << "Average Precision: " << (totalPrecision / testCases.size() * 100.0) << "%" << std::endl;
std::cout << "Average Recall: " << (totalRecall / testCases.size() * 100.0) << "%" << std::endl;
std::cout << "Average F1 Score: " << (totalF1 / testCases.size() * 100.0) << "%" << std::endl;
std::cout << "Average Weighted Score: " << (totalWeighted / testCases.size() * 100.0) << "% (FP penalty: 2x)"
<< std::endl;
std::cout << std::endl;
std::cout << "--- Overall Metrics (total counts) ---" << std::endl;
std::cout << "True Positives: " << totalTP << std::endl;
std::cout << "False Positives: " << totalFP << " (incorrect hyphenation points)" << std::endl;
std::cout << "False Negatives: " << totalFN << " (missed hyphenation points)" << std::endl;
double overallPrecision = totalTP + totalFP > 0 ? static_cast<double>(totalTP) / (totalTP + totalFP) : 0.0;
double overallRecall = totalTP + totalFN > 0 ? static_cast<double>(totalTP) / (totalTP + totalFN) : 0.0;
double overallF1 = overallPrecision + overallRecall > 0
? 2 * overallPrecision * overallRecall / (overallPrecision + overallRecall)
: 0.0;
std::cout << "Overall Precision: " << (overallPrecision * 100.0) << "%" << std::endl;
std::cout << "Overall Recall: " << (overallRecall * 100.0) << "%" << std::endl;
std::cout << "Overall F1 Score: " << (overallF1 * 100.0) << "%" << std::endl;
std::cout << std::endl;
// Filter out perfect matches from the “worst cases” section so that only actionable failures appear.
auto hasImperfection = [](const EvaluationResult& r) { return r.weightedScore < 0.999999; };
std::vector<std::pair<TestCase, EvaluationResult>> imperfectCases;
imperfectCases.reserve(worstCases.size());
for (const auto& entry : worstCases) {
if (hasImperfection(entry.second)) {
imperfectCases.push_back(entry);
}
}
std::cout << "--- Worst Cases (lowest weighted scores) ---" << std::endl;
int showCount = std::min(10, static_cast<int>(imperfectCases.size()));
for (int i = 0; i < showCount; i++) {
const auto& testCase = imperfectCases[i].first;
const auto& result = imperfectCases[i].second;
std::vector<size_t> actualPositions = hyphenateFunc(testCase.word);
std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions);
std::cout << "Word: " << testCase.word << " (freq: " << testCase.frequency << ")" << std::endl;
std::cout << " Expected: " << testCase.hyphenated << std::endl;
std::cout << " Got: " << actualHyphenated << std::endl;
std::cout << " Precision: " << (result.precision * 100.0) << "%"
<< " Recall: " << (result.recall * 100.0) << "%"
<< " F1: " << (result.f1Score * 100.0) << "%"
<< " Weighted: " << (result.weightedScore * 100.0) << "%" << std::endl;
std::cout << " TP: " << result.truePositives << " FP: " << result.falsePositives
<< " FN: " << result.falseNegatives << std::endl;
std::cout << std::endl;
}
// Additional compact list of the worst ~100 words to aid iteration
int compactCount = std::min(100, static_cast<int>(imperfectCases.size()));
if (compactCount > 0) {
std::cout << "--- Compact Worst Cases (" << compactCount << ") ---" << std::endl;
for (int i = 0; i < compactCount; i++) {
const auto& testCase = imperfectCases[i].first;
std::vector<size_t> actualPositions = hyphenateFunc(testCase.word);
std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions);
std::cout << testCase.word << " | exp:" << testCase.hyphenated << " | got:" << actualHyphenated << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[]) {
const bool summaryMode = argc <= 1;
const std::string languageSelection = summaryMode ? "all" : argv[1];
std::vector<LanguageConfig> languages = resolveLanguages(languageSelection);
if (languages.empty()) {
std::cerr << "Unknown language: " << languageSelection << std::endl;
return 1;
}
for (const auto& lang : languages) {
const auto* hyphenator = getLanguageHyphenatorForPrimaryTag(lang.primaryTag);
if (!hyphenator) {
std::cerr << "No hyphenator registered for tag: " << lang.primaryTag << std::endl;
continue;
}
const auto hyphenateFunc = [hyphenator](const std::string& word) {
return hyphenateWordWithHyphenator(word, *hyphenator);
};
if (!summaryMode) {
std::cout << "Loading test data from: " << lang.testDataFile << std::endl;
}
std::vector<TestCase> testCases = loadTestData(lang.testDataFile);
if (testCases.empty()) {
std::cerr << "No test cases loaded for " << lang.cliName << ". Skipping." << std::endl;
continue;
}
if (!summaryMode) {
std::cout << "Loaded " << testCases.size() << " test cases for " << lang.cliName << std::endl;
std::cout << std::endl;
}
int perfectMatches = 0;
int partialMatches = 0;
int completeMisses = 0;
double totalPrecision = 0.0;
double totalRecall = 0.0;
double totalF1 = 0.0;
double totalWeighted = 0.0;
int totalTP = 0, totalFP = 0, totalFN = 0;
std::vector<std::pair<TestCase, EvaluationResult>> worstCases;
for (const auto& testCase : testCases) {
EvaluationResult result = evaluateWord(testCase, hyphenateFunc);
totalTP += result.truePositives;
totalFP += result.falsePositives;
totalFN += result.falseNegatives;
totalPrecision += result.precision;
totalRecall += result.recall;
totalF1 += result.f1Score;
totalWeighted += result.weightedScore;
if (result.f1Score == 1.0) {
perfectMatches++;
} else if (result.f1Score > 0.0) {
partialMatches++;
} else {
completeMisses++;
}
worstCases.push_back({testCase, result});
}
if (summaryMode) {
const double averageF1Percent = testCases.empty() ? 0.0 : (totalF1 / testCases.size() * 100.0);
std::cout << lang.cliName << ": " << averageF1Percent << "%" << std::endl;
continue;
}
std::sort(worstCases.begin(), worstCases.end(),
[](const auto& a, const auto& b) { return a.second.weightedScore < b.second.weightedScore; });
printResults(lang.cliName, testCases, worstCases, perfectMatches, partialMatches, completeMisses, totalPrecision,
totalRecall, totalF1, totalWeighted, totalTP, totalFP, totalFN, hyphenateFunc);
}
return 0;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,204 @@
"""
Generate hyphenation test data from a text file.
This script extracts unique words from a book and generates ground truth
hyphenations using the pyphen library, which can be used to test and validate
the hyphenation implementations (e.g., German, English, Russian).
Usage:
python generate_hyphenation_test_data.py <input_file> <output_file> [--language de_DE]
Requirements:
pip install pyphen
"""
import argparse
import re
from collections import Counter
import pyphen
from pathlib import Path
import zipfile
def extract_text_from_epub(epub_path):
    """Extract textual content from an .epub archive.

    Every archive member whose name ends in .xhtml, .html or .htm is decoded
    as UTF-8 (undecodable bytes are dropped), stripped of markup tags and has
    its HTML character references decoded. The per-file texts are joined with
    newlines, in archive order.

    Args:
        epub_path: Path to the .epub (zip) file.

    Returns:
        The concatenated plain text of all HTML/XHTML members.
    """
    # Local import so this function does not depend on the module's import list.
    import html

    texts = []
    with zipfile.ZipFile(epub_path, "r") as z:
        for name in z.namelist():
            if not name.lower().endswith((".xhtml", ".html", ".htm")):
                continue
            try:
                data = z.read(name).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one unreadable member should not abort the whole
                # extraction, but the skip is reported rather than hidden.
                print(f"Warning: skipping unreadable entry {name}: {exc}")
                continue
            # Remove tags
            text = re.sub(r"<[^>]+>", " ", data)
            # Decode character references (e.g. "&eacute;" -> "é") so they do
            # not show up later as bogus words such as "eacute" or "nbsp".
            texts.append(html.unescape(text))
    return "\n".join(texts)
def extract_words(text):
    """Return every run of letters found in ``text``, in order, case preserved."""
    # "[^\W\d_]" is "a word character that is neither a digit nor an
    # underscore", i.e. a letter of any script.
    letter_run = re.compile(r"[^\W\d_]+", flags=re.UNICODE)
    return letter_run.findall(text)
def clean_word(word):
    """Return ``word`` with leading and trailing whitespace removed."""
    # Case is left untouched; only surrounding whitespace is dropped.
    trimmed = word.strip()
    return trimmed
def generate_hyphenation_data(
    input_file, output_file, language="de_DE", min_length=6, max_words=None
):
    """
    Generate hyphenation test data from a text file.

    Words are taken from the input in order of descending frequency (ties are
    broken alphabetically, ignoring case), hyphenated with pyphen and written
    as ``word|hyphenated|count`` lines after a commented header. Words that
    pyphen does not split are written too.

    Args:
        input_file: Path to input text file (a path ending in .epub is
            unpacked and its HTML content is used instead)
        output_file: Path to output file with hyphenation data
        language: Language code for pyphen (e.g., 'de_DE', 'en_US')
        min_length: Minimum word length to include
        max_words: Maximum number of words to include (None for all)

    Returns:
        None. When the language is unknown to pyphen an error is printed and
        nothing is written.
    """
    print(f"Reading from: {input_file}")

    # Read the input file
    if str(input_file).lower().endswith(".epub"):
        print("Detected .epub input; extracting HTML content")
        text = extract_text_from_epub(input_file)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

    # Extract words
    print("Extracting words...")
    words = extract_words(text)
    print(f"Found {len(words)} total words")

    # Count word frequencies
    word_counts = Counter(words)
    print(f"Found {len(word_counts)} unique words")

    # Initialize pyphen hyphenator
    print(f"Initializing hyphenator for language: {language}")
    try:
        hyphenator = pyphen.Pyphen(lang=language)
    except KeyError:
        print(f"Error: Language '{language}' not found in pyphen.")
        print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
        return

    # Generate hyphenations
    print("Generating hyphenations...")
    hyphenation_data = []

    # Sort by frequency (most common first) then alphabetically
    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0].lower()))
    for word, count in sorted_words:
        # Filter by minimum length
        if len(word) < min_length:
            continue

        # Get hyphenation (may produce no '=' characters)
        hyphenated = hyphenator.inserted(word, hyphen="=")

        # Include all words, so the top N most common words are kept even if
        # they have no hyphenation points.
        hyphenation_data.append(
            {"word": word, "hyphenated": hyphenated, "count": count}
        )

        # Stop if we've reached max_words
        if max_words and len(hyphenation_data) >= max_words:
            break

    print(f"Generated {len(hyphenation_data)} hyphenated words")

    # Write output file
    print(f"Writing to: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        # Write header with metadata
        f.write("# Hyphenation Test Data\n")
        f.write(f"# Source: {Path(input_file).name}\n")
        f.write(f"# Language: {language}\n")
        f.write(f"# Total words: {len(hyphenation_data)}\n")
        f.write("# Format: word | hyphenated_form | frequency_in_source\n")
        f.write("#\n")
        f.write("# Hyphenation points are marked with '='\n")
        f.write("# Example: Silbentrennung -> Sil=ben=tren=nung\n")
        f.write("#\n\n")

        # Write data
        for item in hyphenation_data:
            f.write(f"{item['word']}|{item['hyphenated']}|{item['count']}\n")

    print("Done!")

    # Print some statistics
    total_points = sum(item["hyphenated"].count("=") for item in hyphenation_data)
    words_with_points = sum(1 for item in hyphenation_data if "=" in item["hyphenated"])
    print("\n=== Statistics ===")
    print(f"Total unique words extracted: {len(word_counts)}")
    print(f"Words written: {len(hyphenation_data)}")
    print(f"Words with hyphenation points: {words_with_points}")
    if hyphenation_data:
        print(
            f"Average hyphenation points per word: {total_points / len(hyphenation_data):.2f}"
        )
    else:
        # No word passed the filters; avoid dividing by zero.
        print("Average hyphenation points per word: n/a (no words selected)")

    # Print some examples
    print("\n=== Examples (first 10) ===")
    for item in hyphenation_data[:10]:
        print(
            f" {item['word']:20} -> {item['hyphenated']:30} (appears {item['count']}x)"
        )
def main():
    """Parse the command line and run the test data generation."""
    usage_examples = """
Examples:
# Generate test data from a German book
python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt
# Limit to 500 most common words
python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt --max-words 500
# Use English hyphenation (when available)
python generate_hyphenation_test_data.py book.txt test_en.txt --language en_US
"""
    cli = argparse.ArgumentParser(
        description="Generate hyphenation test data from a text file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional arguments.
    cli.add_argument("input_file", help="Input text file to extract words from")
    cli.add_argument("output_file", help="Output file for hyphenation test data")

    # Optional tuning knobs.
    cli.add_argument(
        "--language", default="de_DE", help="Language code (default: de_DE)"
    )
    cli.add_argument(
        "--min-length", type=int, default=6, help="Minimum word length (default: 6)"
    )
    cli.add_argument(
        "--max-words",
        type=int,
        help="Maximum number of words to include (default: all)",
    )

    options = cli.parse_args()
    generate_hyphenation_data(
        options.input_file,
        options.output_file,
        language=options.language,
        min_length=options.min_length,
        max_words=options.max_words,
    )


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

32
test/run_hyphenation_eval.sh Executable file
View File

@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Compiles the host-side hyphenation evaluation program and runs it.
# Every argument given to this script is passed through to the binary.
set -euo pipefail

# Repository root: one level above the directory that holds this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
repo_root="$(cd "$script_dir/.." && pwd)"
out_dir="$repo_root/build/hyphenation_eval"
eval_bin="$out_dir/HyphenationEvaluationTest"

mkdir -p "$out_dir"

# Translation units to compile, relative to the repository root.
rel_sources=(
  "test/hyphenation_eval/HyphenationEvaluationTest.cpp"
  "lib/Epub/Epub/hyphenation/Hyphenator.cpp"
  "lib/Epub/Epub/hyphenation/LanguageRegistry.cpp"
  "lib/Epub/Epub/hyphenation/LiangHyphenation.cpp"
  "lib/Epub/Epub/hyphenation/HyphenationCommon.cpp"
  "lib/Utf8/Utf8.cpp"
)
abs_sources=()
for rel in "${rel_sources[@]}"; do
  abs_sources+=("$repo_root/$rel")
done

# Language level, optimisation and warnings, then the include search paths.
compile_flags=(-std=c++20 -O2 -Wall -Wextra -pedantic)
compile_flags+=("-I$repo_root" "-I$repo_root/lib" "-I$repo_root/lib/Utf8")

c++ "${compile_flags[@]}" "${abs_sources[@]}" -o "$eval_bin"
"$eval_bin" "$@"