mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-06 15:47:39 +03:00
default min suffix/prefix length 3 for english
This commit is contained in:
parent
3f9d84dafb
commit
85a737fd82
@ -11,10 +11,11 @@
|
||||
|
||||
namespace {
|
||||
|
||||
LanguageHyphenator englishHyphenator(en_us_patterns, isLatinLetter, toLowerLatin);
|
||||
// English hyphenation patterns (3/3 minimum prefix/suffix length)
|
||||
LanguageHyphenator englishHyphenator(en_us_patterns, isLatinLetter, toLowerLatin, 3, 3);
|
||||
LanguageHyphenator frenchHyphenator(fr_patterns, isLatinLetter, toLowerLatin);
|
||||
LanguageHyphenator germanHyphenator(de_patterns, isLatinLetter, toLowerLatin);
|
||||
LanguageHyphenator russianHyphenator(ru_ru_patterns, isCyrillicLetter, toLowerCyrillic, 2, 2);
|
||||
LanguageHyphenator russianHyphenator(ru_ru_patterns, isCyrillicLetter, toLowerCyrillic);
|
||||
|
||||
using EntryArray = std::array<LanguageEntry, 4>;
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,8 @@ hyphenations using the pyphen library, which can be used to test and validate
|
||||
the hyphenation implementations (e.g., German, English, Russian).
|
||||
|
||||
Usage:
|
||||
python generate_hyphenation_test_data.py <input_file> <output_file> [--language de_DE]
|
||||
python generate_hyphenation_test_data.py <input_file> <output_file>
|
||||
[--language de_DE] [--max-words 5000] [--min-prefix 2] [--min-suffix 2]
|
||||
|
||||
Requirements:
|
||||
pip install pyphen
|
||||
@ -54,7 +55,13 @@ def clean_word(word):
|
||||
|
||||
|
||||
def generate_hyphenation_data(
|
||||
input_file, output_file, language="de_DE", min_length=6, max_words=None
|
||||
input_file,
|
||||
output_file,
|
||||
language="de_DE",
|
||||
min_length=6,
|
||||
max_words=5000,
|
||||
min_prefix=2,
|
||||
min_suffix=2,
|
||||
):
|
||||
"""
|
||||
Generate hyphenation test data from a text file.
|
||||
@ -64,7 +71,9 @@ def generate_hyphenation_data(
|
||||
output_file: Path to output file with hyphenation data
|
||||
language: Language code for pyphen (e.g., 'de_DE', 'en_US')
|
||||
min_length: Minimum word length to include
|
||||
max_words: Maximum number of words to include (None for all)
|
||||
max_words: Maximum number of words to include (default: 5000)
|
||||
min_prefix: Minimum characters allowed before the first hyphen (default: 2)
|
||||
min_suffix: Minimum characters allowed after the last hyphen (default: 2)
|
||||
"""
|
||||
print(f"Reading from: {input_file}")
|
||||
|
||||
@ -86,9 +95,11 @@ def generate_hyphenation_data(
|
||||
print(f"Found {len(word_counts)} unique words")
|
||||
|
||||
# Initialize pyphen hyphenator
|
||||
print(f"Initializing hyphenator for language: {language}")
|
||||
print(
|
||||
f"Initializing hyphenator for language: {language} (min_prefix={min_prefix}, min_suffix={min_suffix})"
|
||||
)
|
||||
try:
|
||||
hyphenator = pyphen.Pyphen(lang=language)
|
||||
hyphenator = pyphen.Pyphen(lang=language, left=min_prefix, right=min_suffix)
|
||||
except KeyError:
|
||||
print(f"Error: Language '{language}' not found in pyphen.")
|
||||
print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
|
||||
@ -129,6 +140,8 @@ def generate_hyphenation_data(
|
||||
f.write(f"# Hyphenation Test Data\n")
|
||||
f.write(f"# Source: {Path(input_file).name}\n")
|
||||
f.write(f"# Language: {language}\n")
|
||||
f.write(f"# Min prefix: {min_prefix}\n")
|
||||
f.write(f"# Min suffix: {min_suffix}\n")
|
||||
f.write(f"# Total words: {len(hyphenation_data)}\n")
|
||||
f.write(f"# Format: word | hyphenated_form | frequency_in_source\n")
|
||||
f.write(f"#\n")
|
||||
@ -186,7 +199,20 @@ Examples:
|
||||
parser.add_argument(
|
||||
"--max-words",
|
||||
type=int,
|
||||
help="Maximum number of words to include (default: all)",
|
||||
default=5000,
|
||||
help="Maximum number of words to include (default: 5000)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-prefix",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Minimum characters permitted before the first hyphen (default: 2)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-suffix",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Minimum characters permitted after the last hyphen (default: 2)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@ -197,6 +223,8 @@ Examples:
|
||||
language=args.language,
|
||||
min_length=args.min_length,
|
||||
max_words=args.max_words,
|
||||
min_prefix=args.min_prefix,
|
||||
min_suffix=args.min_suffix,
|
||||
)
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user