mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-04 14:47:37 +03:00
## Summary * Adds (optional) Hyphenation for English, French, German, Russian languages ## Additional Context * Included hyphenation dictionaries add approximately 280kb to the flash usage (German alone takes 200kb) * Trie encoded dictionaries are adopted from hypher project (https://github.com/typst/hypher) * Soft hyphens (and other explicit hyphens) take precedence over dict-based hyphenation. Overall, the hyphenation rules are quite aggressive, as I believe it makes more sense on our smaller screen. --------- Co-authored-by: Dave Allie <dave@daveallie.com>
233 lines
7.6 KiB
Python
233 lines
7.6 KiB
Python
"""
|
|
Generate hyphenation test data from a text file.
|
|
|
|
This script extracts unique words from a book and generates ground truth
|
|
hyphenations using the pyphen library, which can be used to test and validate
|
|
the hyphenation implementations (e.g., German, English, Russian).
|
|
|
|
Usage:
|
|
python generate_hyphenation_test_data.py <input_file> <output_file>
|
|
[--language de_DE] [--max-words 5000] [--min-prefix 2] [--min-suffix 2]
|
|
|
|
Requirements:
|
|
pip install pyphen
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
from collections import Counter
|
|
import pyphen
|
|
from pathlib import Path
|
|
import zipfile
|
|
|
|
|
|
def extract_text_from_epub(epub_path):
    """Extract textual content from an .epub archive.

    An .epub file is a zip container; this concatenates the text of every
    HTML/XHTML entry, with markup tags replaced by spaces so adjacent words
    do not run together.

    Args:
        epub_path: Path to the .epub file.

    Returns:
        A single string joining the de-tagged text of all HTML-ish entries
        with newlines. Unreadable entries are silently skipped (best-effort).
    """
    texts = []
    with zipfile.ZipFile(epub_path, "r") as z:
        for name in z.namelist():
            # Tuple form of endswith replaces the original three-way or-chain.
            if not name.lower().endswith((".xhtml", ".html", ".htm")):
                continue
            try:
                data = z.read(name).decode("utf-8", errors="ignore")
            except Exception:
                # Deliberately broad: a corrupt entry should not abort the
                # whole extraction; skip it and keep going.
                continue
            # Remove tags; a space (not empty string) preserves word breaks.
            texts.append(re.sub(r"<[^>]+>", " ", data))
    return "\n".join(texts)
|
|
|
|
|
|
def extract_words(text):
    """Return every word found in *text*, original casing preserved.

    A "word" is a maximal run of Unicode letters in any script; digits and
    underscores act as separators (so ``"foo_bar"`` yields two words).
    """
    letter_run = re.compile(r"[^\W\d_]+", re.UNICODE)
    return letter_run.findall(text)
|
|
|
|
|
|
def clean_word(word):
    """Strip leading/trailing whitespace from *word*.

    NOTE(review): the original comment claimed non-letter characters are
    stripped, but ``str.strip()`` with no arguments only removes whitespace;
    the documentation now matches the actual behavior. (Words produced by
    ``extract_words`` contain only letters anyway.)
    """
    return word.strip()
|
|
|
|
|
|
def generate_hyphenation_data(
    input_file,
    output_file,
    language="de_DE",
    min_length=6,
    max_words=5000,
    min_prefix=2,
    min_suffix=2,
):
    """
    Generate hyphenation test data from a text file.

    Args:
        input_file: Path to input text file (or .epub archive)
        output_file: Path to output file with hyphenation data
        language: Language code for pyphen (e.g., 'de_DE', 'en_US')
        min_length: Minimum word length to include
        max_words: Maximum number of words to include (default: 5000)
        min_prefix: Minimum characters allowed before the first hyphen (default: 2)
        min_suffix: Minimum characters allowed after the last hyphen (default: 2)
    """
    print(f"Reading from: {input_file}")

    # Read the input file; .epub archives need their HTML entries extracted.
    if str(input_file).lower().endswith(".epub"):
        print("Detected .epub input; extracting HTML content")
        text = extract_text_from_epub(input_file)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

    # Extract words
    print("Extracting words...")
    words = extract_words(text)
    print(f"Found {len(words)} total words")

    # Count word frequencies
    word_counts = Counter(words)
    print(f"Found {len(word_counts)} unique words")

    # Initialize pyphen hyphenator; 'left'/'right' bound the hyphen positions.
    print(
        f"Initializing hyphenator for language: {language} (min_prefix={min_prefix}, min_suffix={min_suffix})"
    )
    try:
        hyphenator = pyphen.Pyphen(lang=language, left=min_prefix, right=min_suffix)
    except KeyError:
        print(f"Error: Language '{language}' not found in pyphen.")
        print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
        return

    # Generate hyphenations
    print("Generating hyphenations...")
    hyphenation_data = []

    # Sort by frequency (most common first) then alphabetically
    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0].lower()))

    for word, count in sorted_words:
        # Filter by minimum length
        if len(word) < min_length:
            continue

        # Get hyphenation (may produce no '=' characters)
        hyphenated = hyphenator.inserted(word, hyphen="=")

        # Include all words, even those without hyphenation points, so the
        # output contains the top N most common words.
        hyphenation_data.append(
            {"word": word, "hyphenated": hyphenated, "count": count}
        )

        # Stop if we've reached max_words
        if max_words and len(hyphenation_data) >= max_words:
            break

    print(f"Generated {len(hyphenation_data)} hyphenated words")

    # Write output file
    print(f"Writing to: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        # Write header with metadata (f-prefix dropped from constant strings).
        f.write("# Hyphenation Test Data\n")
        f.write(f"# Source: {Path(input_file).name}\n")
        f.write(f"# Language: {language}\n")
        f.write(f"# Min prefix: {min_prefix}\n")
        f.write(f"# Min suffix: {min_suffix}\n")
        f.write(f"# Total words: {len(hyphenation_data)}\n")
        f.write("# Format: word | hyphenated_form | frequency_in_source\n")
        f.write("#\n")
        f.write("# Hyphenation points are marked with '='\n")
        f.write("# Example: Silbentrennung -> Sil=ben=tren=nung\n")
        f.write("#\n\n")

        # Write data
        for item in hyphenation_data:
            f.write(f"{item['word']}|{item['hyphenated']}|{item['count']}\n")

    print("Done!")

    # Print some statistics
    print("\n=== Statistics ===")
    print(f"Total unique words extracted: {len(word_counts)}")
    # BUG FIX: the old label said "Words with hyphenation points", but the loop
    # above includes every word regardless of hyphenation; the label now
    # reflects what is actually counted.
    print(f"Words included in output: {len(hyphenation_data)}")
    # BUG FIX: guard the average against ZeroDivisionError when no word
    # passed the min_length filter.
    if hyphenation_data:
        avg_points = sum(
            h["hyphenated"].count("=") for h in hyphenation_data
        ) / len(hyphenation_data)
        print(f"Average hyphenation points per word: {avg_points:.2f}")

    # Print some examples
    print("\n=== Examples (first 10) ===")
    for item in hyphenation_data[:10]:
        print(
            f" {item['word']:20} -> {item['hyphenated']:30} (appears {item['count']}x)"
        )
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments, then run the generator."""
    parser = argparse.ArgumentParser(
        description="Generate hyphenation test data from a text file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate test data from a German book
  python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt

  # Limit to 500 most common words
  python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt --max-words 500

  # Use English hyphenation (when available)
  python generate_hyphenation_test_data.py book.txt test_en.txt --language en_US
""",
    )

    # Positional arguments.
    parser.add_argument("input_file", help="Input text file to extract words from")
    parser.add_argument("output_file", help="Output file for hyphenation test data")

    # Optional tuning flags; defaults match generate_hyphenation_data().
    parser.add_argument(
        "--language", default="de_DE", help="Language code (default: de_DE)"
    )
    parser.add_argument(
        "--min-length", type=int, default=6, help="Minimum word length (default: 6)"
    )
    parser.add_argument(
        "--max-words",
        type=int,
        default=5000,
        help="Maximum number of words to include (default: 5000)",
    )
    parser.add_argument(
        "--min-prefix",
        type=int,
        default=2,
        help="Minimum characters permitted before the first hyphen (default: 2)",
    )
    parser.add_argument(
        "--min-suffix",
        type=int,
        default=2,
        help="Minimum characters permitted after the last hyphen (default: 2)",
    )

    opts = parser.parse_args()

    generate_hyphenation_data(
        opts.input_file,
        opts.output_file,
        language=opts.language,
        min_length=opts.min_length,
        max_words=opts.max_words,
        min_prefix=opts.min_prefix,
        min_suffix=opts.min_suffix,
    )
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|