mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-07 08:07:40 +03:00
default min suffix/prefix length 3 for english
This commit is contained in:
parent
3f9d84dafb
commit
85a737fd82
@ -11,10 +11,11 @@
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
LanguageHyphenator englishHyphenator(en_us_patterns, isLatinLetter, toLowerLatin);
|
// English hyphenation patterns (3/3 minimum prefix/suffix length)
|
||||||
|
LanguageHyphenator englishHyphenator(en_us_patterns, isLatinLetter, toLowerLatin, 3, 3);
|
||||||
LanguageHyphenator frenchHyphenator(fr_patterns, isLatinLetter, toLowerLatin);
|
LanguageHyphenator frenchHyphenator(fr_patterns, isLatinLetter, toLowerLatin);
|
||||||
LanguageHyphenator germanHyphenator(de_patterns, isLatinLetter, toLowerLatin);
|
LanguageHyphenator germanHyphenator(de_patterns, isLatinLetter, toLowerLatin);
|
||||||
LanguageHyphenator russianHyphenator(ru_ru_patterns, isCyrillicLetter, toLowerCyrillic, 2, 2);
|
LanguageHyphenator russianHyphenator(ru_ru_patterns, isCyrillicLetter, toLowerCyrillic);
|
||||||
|
|
||||||
using EntryArray = std::array<LanguageEntry, 4>;
|
using EntryArray = std::array<LanguageEntry, 4>;
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,8 @@ hyphenations using the pyphen library, which can be used to test and validate
|
|||||||
the hyphenation implementations (e.g., German, English, Russian).
|
the hyphenation implementations (e.g., German, English, Russian).
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python generate_hyphenation_test_data.py <input_file> <output_file> [--language de_DE]
|
python generate_hyphenation_test_data.py <input_file> <output_file>
|
||||||
|
[--language de_DE] [--max-words 5000] [--min-prefix 2] [--min-suffix 2]
|
||||||
|
|
||||||
Requirements:
|
Requirements:
|
||||||
pip install pyphen
|
pip install pyphen
|
||||||
@ -54,7 +55,13 @@ def clean_word(word):
|
|||||||
|
|
||||||
|
|
||||||
def generate_hyphenation_data(
|
def generate_hyphenation_data(
|
||||||
input_file, output_file, language="de_DE", min_length=6, max_words=None
|
input_file,
|
||||||
|
output_file,
|
||||||
|
language="de_DE",
|
||||||
|
min_length=6,
|
||||||
|
max_words=5000,
|
||||||
|
min_prefix=2,
|
||||||
|
min_suffix=2,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Generate hyphenation test data from a text file.
|
Generate hyphenation test data from a text file.
|
||||||
@ -64,7 +71,9 @@ def generate_hyphenation_data(
|
|||||||
output_file: Path to output file with hyphenation data
|
output_file: Path to output file with hyphenation data
|
||||||
language: Language code for pyphen (e.g., 'de_DE', 'en_US')
|
language: Language code for pyphen (e.g., 'de_DE', 'en_US')
|
||||||
min_length: Minimum word length to include
|
min_length: Minimum word length to include
|
||||||
max_words: Maximum number of words to include (None for all)
|
max_words: Maximum number of words to include (default: 5000)
|
||||||
|
min_prefix: Minimum characters allowed before the first hyphen (default: 2)
|
||||||
|
min_suffix: Minimum characters allowed after the last hyphen (default: 2)
|
||||||
"""
|
"""
|
||||||
print(f"Reading from: {input_file}")
|
print(f"Reading from: {input_file}")
|
||||||
|
|
||||||
@ -86,9 +95,11 @@ def generate_hyphenation_data(
|
|||||||
print(f"Found {len(word_counts)} unique words")
|
print(f"Found {len(word_counts)} unique words")
|
||||||
|
|
||||||
# Initialize pyphen hyphenator
|
# Initialize pyphen hyphenator
|
||||||
print(f"Initializing hyphenator for language: {language}")
|
print(
|
||||||
|
f"Initializing hyphenator for language: {language} (min_prefix={min_prefix}, min_suffix={min_suffix})"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
hyphenator = pyphen.Pyphen(lang=language)
|
hyphenator = pyphen.Pyphen(lang=language, left=min_prefix, right=min_suffix)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
print(f"Error: Language '{language}' not found in pyphen.")
|
print(f"Error: Language '{language}' not found in pyphen.")
|
||||||
print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
|
print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
|
||||||
@ -129,6 +140,8 @@ def generate_hyphenation_data(
|
|||||||
f.write(f"# Hyphenation Test Data\n")
|
f.write(f"# Hyphenation Test Data\n")
|
||||||
f.write(f"# Source: {Path(input_file).name}\n")
|
f.write(f"# Source: {Path(input_file).name}\n")
|
||||||
f.write(f"# Language: {language}\n")
|
f.write(f"# Language: {language}\n")
|
||||||
|
f.write(f"# Min prefix: {min_prefix}\n")
|
||||||
|
f.write(f"# Min suffix: {min_suffix}\n")
|
||||||
f.write(f"# Total words: {len(hyphenation_data)}\n")
|
f.write(f"# Total words: {len(hyphenation_data)}\n")
|
||||||
f.write(f"# Format: word | hyphenated_form | frequency_in_source\n")
|
f.write(f"# Format: word | hyphenated_form | frequency_in_source\n")
|
||||||
f.write(f"#\n")
|
f.write(f"#\n")
|
||||||
@ -186,7 +199,20 @@ Examples:
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--max-words",
|
"--max-words",
|
||||||
type=int,
|
type=int,
|
||||||
help="Maximum number of words to include (default: all)",
|
default=5000,
|
||||||
|
help="Maximum number of words to include (default: 5000)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--min-prefix",
|
||||||
|
type=int,
|
||||||
|
default=2,
|
||||||
|
help="Minimum characters permitted before the first hyphen (default: 2)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--min-suffix",
|
||||||
|
type=int,
|
||||||
|
default=2,
|
||||||
|
help="Minimum characters permitted after the last hyphen (default: 2)",
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -197,6 +223,8 @@ Examples:
|
|||||||
language=args.language,
|
language=args.language,
|
||||||
min_length=args.min_length,
|
min_length=args.min_length,
|
||||||
max_words=args.max_words,
|
max_words=args.max_words,
|
||||||
|
min_prefix=args.min_prefix,
|
||||||
|
min_suffix=args.min_suffix,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user