hyphenation evaluation

2026-02-06 23:57:39 +03:00 · 2026-01-14 01:10:18 +05:00 · 2026-01-14 01:10:18 +05:00 · ee2db07e64
commit ee2db07e64
parent 3de16d679a
7 changed files with 46049 additions and 0 deletions
--- a/test/hyphenation_eval/HyphenationEvaluationTest.cpp
+++ b/test/hyphenation_eval/HyphenationEvaluationTest.cpp
@ -0,0 +1,389 @@
 #include <algorithm>
 #include <cctype>
 #include <cmath>
 #include <fstream>
 #include <functional>
 #include <iostream>
 #include <sstream>
 #include <string>
 #include <vector>
 #include <Utf8.h>
 #include "lib/Epub/Epub/hyphenation/HyphenationCommon.h"
 #include "lib/Epub/Epub/hyphenation/LanguageRegistry.h"
 #include "lib/Epub/Epub/hyphenation/LanguageHyphenator.h"
 // One labelled sample read from a hyphenation test-data file.
 struct TestCase {
  // The word exactly as stored in the first field of the data line.
  std::string word;
  // The same word with '=' inserted at every expected break (second field).
  std::string hyphenated;
  // Break indexes derived from `hyphenated` by expectedPositionsFromAnnotatedWord.
  std::vector<size_t> expectedPositions;
  // Occurrence count from the third field; zero until a loader fills it in, so a
  // default-constructed TestCase never carries an indeterminate value.
  int frequency = 0;
 };
 // Per-word scoring outcome produced by evaluateWord; every field starts at zero.
 struct EvaluationResult {
  // Raw break-point counts.
  int truePositives{0};
  int falsePositives{0};
  int falseNegatives{0};
  // Ratios derived from the counts above, each stored in the range [0, 1].
  double precision{0.0};
  double recall{0.0};
  double f1Score{0.0};
  // Penalty-weighted score (false positives cost more than misses).
  double weightedScore{0.0};
 };
 // Describes one language the evaluator can be run against.
 struct LanguageConfig {
  // Name accepted on the command line (matched verbatim by resolveLanguages).
  std::string cliName;
  // Path of the test-data file handed to loadTestData.
  std::string testDataFile;
  // Tag handed to getLanguageHyphenatorForPrimaryTag; null until explicitly set so a
  // default-constructed config never exposes an indeterminate pointer.
  const char* primaryTag = nullptr;
 };
 // Languages the evaluator knows about, in the order they are processed when "all" is
 // selected. Each entry is {CLI name, test-data path, primary tag}. The data paths are
 // relative, so they are resolved against the working directory of the process.
 const std::vector<LanguageConfig> kSupportedLanguages = {
    {"english", "test/hyphenation_eval/resources/english_hyphenation_tests.txt", "en"},
    {"french", "test/hyphenation_eval/resources/french_hyphenation_tests.txt", "fr"},
    {"german", "test/hyphenation_eval/resources/german_hyphenation_tests.txt", "de"},
    {"russian", "test/hyphenation_eval/resources/russian_hyphenation_tests.txt", "ru"},
 };
 // Converts an annotated word such as "Sil=ben" into the list of break indexes.
 // Every '=' records the number of units consumed so far via utf8NextCodepoint; the
 // '=' markers themselves are not counted.
 std::vector<size_t> expectedPositionsFromAnnotatedWord(const std::string& annotated) {
  std::vector<size_t> breakIndexes;
  size_t unitsSeen = 0;
  const auto* cursor = reinterpret_cast<const unsigned char*>(annotated.c_str());
  while (*cursor != 0) {
    if (*cursor != '=') {
      // Ordinary character: let the UTF-8 helper advance past it and count it.
      utf8NextCodepoint(&cursor);
      ++unitsSeen;
    } else {
      // Break marker: remember where it sits and step over the single byte.
      breakIndexes.push_back(unitsSeen);
      ++cursor;
    }
  }
  return breakIndexes;
 }
 std::vector<TestCase> loadTestData(const std::string& filename) {
  std::vector<TestCase> testCases;
  std::ifstream file(filename);
  if (!file.is_open()) {
    std::cerr << "Error: Could not open file " << filename << std::endl;
    return testCases;
  }
  std::string line;
  while (std::getline(file, line)) {
    if (line.empty() || line[0] == '#') {
      continue;
    }
    std::istringstream iss(line);
    std::string word, hyphenated, freqStr;
    if (std::getline(iss, word, '|') && std::getline(iss, hyphenated, '|') && std::getline(iss, freqStr, '|')) {
      TestCase testCase;
      testCase.word = word;
      testCase.hyphenated = hyphenated;
      testCase.frequency = std::stoi(freqStr);
      testCase.expectedPositions = expectedPositionsFromAnnotatedWord(hyphenated);
      testCases.push_back(testCase);
    }
  }
  file.close();
  return testCases;
 }
 // Renders `word` with a '=' inserted at each index listed in `positions`.
 // Indexes are matched against the running count of units consumed via
 // utf8NextCodepoint; an index equal to the total count yields a trailing '='.
 // Repeated indexes produce repeated markers; indexes beyond the end are dropped.
 std::string positionsToHyphenated(const std::string& word, const std::vector<size_t>& positions) {
  std::vector<size_t> ordered(positions.begin(), positions.end());
  std::sort(ordered.begin(), ordered.end());

  std::string annotated;
  auto pending = ordered.cbegin();
  // Emits one '=' for every pending break that sits exactly at `index`.
  const auto emitMarksAt = [&annotated, &pending, &ordered](size_t index) {
    for (; pending != ordered.cend() && *pending == index; ++pending) {
      annotated.push_back('=');
    }
  };

  const auto* cursor = reinterpret_cast<const unsigned char*>(word.c_str());
  size_t index = 0;
  for (; *cursor != 0; ++index) {
    emitMarksAt(index);
    // Copy the bytes the UTF-8 helper stepped over, unchanged.
    const unsigned char* start = cursor;
    utf8NextCodepoint(&cursor);
    annotated.append(reinterpret_cast<const char*>(start), reinterpret_cast<const char*>(cursor));
  }
  emitMarksAt(index);
  return annotated;
 }
 // Feeds `word` to `hyphenator` and returns the break indexes it reports.
 // The word is first turned into a codepoint list and passed through the two trim
 // helpers, in this order, before breakIndexes is called.
 // NOTE(review): if the trim helpers remove leading entries, the returned indexes are
 // relative to the trimmed list rather than to `word` - confirm against their definitions.
 std::vector<size_t> hyphenateWordWithHyphenator(const std::string& word, const LanguageHyphenator& hyphenator) {
  auto codepoints = collectCodepoints(word);
  trimSurroundingPunctuation(codepoints);
  trimTrailingFootnoteReference(codepoints);
  return hyphenator.breakIndexes(codepoints);
 }
 std::vector<LanguageConfig> resolveLanguages(const std::string& selection) {
  if (selection == "all") {
    return kSupportedLanguages;
  }
  for (const auto& config : kSupportedLanguages) {
    if (config.cliName == selection) {
      return {config};
    }
  }
  return {};
 }
 // Scores the output of `hyphenateFunc` for one test case.
 // Counts true/false positives and false negatives by comparing the produced break
 // indexes with the expected ones, derives precision, recall and F1, and computes a
 // weighted score in which a false positive costs twice as much as a miss.
 EvaluationResult evaluateWord(const TestCase& testCase,
                               std::function<std::vector<size_t>(const std::string&)> hyphenateFunc) {
  constexpr double kFalsePositivePenalty = 2.0;
  constexpr double kFalseNegativePenalty = 1.0;

  std::vector<size_t> produced = hyphenateFunc(testCase.word);
  std::vector<size_t> wanted = testCase.expectedPositions;
  std::sort(wanted.begin(), wanted.end());
  std::sort(produced.begin(), produced.end());

  EvaluationResult outcome;
  // Both vectors are sorted, so membership can be answered with a binary search.
  for (const size_t pos : produced) {
    const bool hit = std::binary_search(wanted.begin(), wanted.end(), pos);
    ++(hit ? outcome.truePositives : outcome.falsePositives);
  }
  for (const size_t pos : wanted) {
    if (!std::binary_search(produced.begin(), produced.end(), pos)) {
      ++outcome.falseNegatives;
    }
  }

  const int predicted = outcome.truePositives + outcome.falsePositives;
  const int relevant = outcome.truePositives + outcome.falseNegatives;
  if (predicted > 0) {
    outcome.precision = static_cast<double>(outcome.truePositives) / predicted;
  }
  if (relevant > 0) {
    outcome.recall = static_cast<double>(outcome.truePositives) / relevant;
  }
  if (outcome.precision + outcome.recall > 0) {
    outcome.f1Score = 2 * outcome.precision * outcome.recall / (outcome.precision + outcome.recall);
  }

  // A word with no expected breaks and no produced breaks is a perfect match; without
  // this override it would score zero and drag down the per-word averages.
  if (wanted.empty() && produced.empty()) {
    outcome.precision = 1.0;
    outcome.recall = 1.0;
    outcome.f1Score = 1.0;
  }

  // Penalty totals are whole numbers, so converting them to int loses nothing.
  const int errorCost = static_cast<int>(outcome.falsePositives * kFalsePositivePenalty +
                                         outcome.falseNegatives * kFalseNegativePenalty);
  const int maxCost = static_cast<int>(wanted.size() * kFalsePositivePenalty);
  if (maxCost > 0) {
    outcome.weightedScore = std::max(0.0, 1.0 - (static_cast<double>(errorCost) / maxCost));
  } else if (outcome.falsePositives == 0) {
    outcome.weightedScore = 1.0;
  }
  return outcome;
 }
 void printResults(const std::string& language, const std::vector<TestCase>& testCases,
                  const std::vector<std::pair<TestCase, EvaluationResult>>& worstCases, int perfectMatches,
                  int partialMatches, int completeMisses, double totalPrecision, double totalRecall, double totalF1,
                  double totalWeighted, int totalTP, int totalFP, int totalFN,
                  std::function<std::vector<size_t>(const std::string&)> hyphenateFunc) {
  std::string lang_upper = language;
  if (!lang_upper.empty()) {
    lang_upper[0] = std::toupper(lang_upper[0]);
  }
  std::cout << "================================================================================" << std::endl;
  std::cout << lang_upper << " HYPHENATION EVALUATION RESULTS" << std::endl;
  std::cout << "================================================================================" << std::endl;
  std::cout << std::endl;
  std::cout << "Total test cases:   " << testCases.size() << std::endl;
  std::cout << "Perfect matches:    " << perfectMatches << " (" << (perfectMatches * 100.0 / testCases.size()) << "%)"
            << std::endl;
  std::cout << "Partial matches:    " << partialMatches << std::endl;
  std::cout << "Complete misses:    " << completeMisses << std::endl;
  std::cout << std::endl;
  std::cout << "--- Overall Metrics (averaged per word) ---" << std::endl;
  std::cout << "Average Precision:       " << (totalPrecision / testCases.size() * 100.0) << "%" << std::endl;
  std::cout << "Average Recall:          " << (totalRecall / testCases.size() * 100.0) << "%" << std::endl;
  std::cout << "Average F1 Score:        " << (totalF1 / testCases.size() * 100.0) << "%" << std::endl;
  std::cout << "Average Weighted Score:  " << (totalWeighted / testCases.size() * 100.0) << "% (FP penalty: 2x)"
            << std::endl;
  std::cout << std::endl;
  std::cout << "--- Overall Metrics (total counts) ---" << std::endl;
  std::cout << "True Positives:          " << totalTP << std::endl;
  std::cout << "False Positives:         " << totalFP << " (incorrect hyphenation points)" << std::endl;
  std::cout << "False Negatives:         " << totalFN << " (missed hyphenation points)" << std::endl;
  double overallPrecision = totalTP + totalFP > 0 ? static_cast<double>(totalTP) / (totalTP + totalFP) : 0.0;
  double overallRecall = totalTP + totalFN > 0 ? static_cast<double>(totalTP) / (totalTP + totalFN) : 0.0;
  double overallF1 = overallPrecision + overallRecall > 0
                         ? 2 * overallPrecision * overallRecall / (overallPrecision + overallRecall)
                         : 0.0;
  std::cout << "Overall Precision:       " << (overallPrecision * 100.0) << "%" << std::endl;
  std::cout << "Overall Recall:          " << (overallRecall * 100.0) << "%" << std::endl;
  std::cout << "Overall F1 Score:        " << (overallF1 * 100.0) << "%" << std::endl;
  std::cout << std::endl;
  // Filter out perfect matches from the “worst cases” section so that only actionable failures appear.
  auto hasImperfection = [](const EvaluationResult& r) { return r.weightedScore < 0.999999; };
  std::vector<std::pair<TestCase, EvaluationResult>> imperfectCases;
  imperfectCases.reserve(worstCases.size());
  for (const auto& entry : worstCases) {
    if (hasImperfection(entry.second)) {
      imperfectCases.push_back(entry);
    }
  }
  std::cout << "--- Worst Cases (lowest weighted scores) ---" << std::endl;
  int showCount = std::min(10, static_cast<int>(imperfectCases.size()));
  for (int i = 0; i < showCount; i++) {
    const auto& testCase = imperfectCases[i].first;
    const auto& result = imperfectCases[i].second;
    std::vector<size_t> actualPositions = hyphenateFunc(testCase.word);
    std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions);
    std::cout << "Word: " << testCase.word << " (freq: " << testCase.frequency << ")" << std::endl;
    std::cout << "  Expected:  " << testCase.hyphenated << std::endl;
    std::cout << "  Got:       " << actualHyphenated << std::endl;
    std::cout << "  Precision: " << (result.precision * 100.0) << "%"
              << "  Recall: " << (result.recall * 100.0) << "%"
              << "  F1: " << (result.f1Score * 100.0) << "%"
              << "  Weighted: " << (result.weightedScore * 100.0) << "%" << std::endl;
    std::cout << "  TP: " << result.truePositives << "  FP: " << result.falsePositives
              << "  FN: " << result.falseNegatives << std::endl;
    std::cout << std::endl;
  }
  // Additional compact list of the worst ~100 words to aid iteration
  int compactCount = std::min(100, static_cast<int>(imperfectCases.size()));
  if (compactCount > 0) {
    std::cout << "--- Compact Worst Cases (" << compactCount << ") ---" << std::endl;
    for (int i = 0; i < compactCount; i++) {
      const auto& testCase = imperfectCases[i].first;
      std::vector<size_t> actualPositions = hyphenateFunc(testCase.word);
      std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions);
      std::cout << testCase.word << " | exp:" << testCase.hyphenated << " | got:" << actualHyphenated << std::endl;
    }
    std::cout << std::endl;
  }
 }
 int main(int argc, char* argv[]) {
  const bool summaryMode = argc <= 1;
  const std::string languageSelection = summaryMode ? "all" : argv[1];
  std::vector<LanguageConfig> languages = resolveLanguages(languageSelection);
  if (languages.empty()) {
    std::cerr << "Unknown language: " << languageSelection << std::endl;
    return 1;
  }
  for (const auto& lang : languages) {
    const auto* hyphenator = getLanguageHyphenatorForPrimaryTag(lang.primaryTag);
    if (!hyphenator) {
      std::cerr << "No hyphenator registered for tag: " << lang.primaryTag << std::endl;
      continue;
    }
    const auto hyphenateFunc = [hyphenator](const std::string& word) {
      return hyphenateWordWithHyphenator(word, *hyphenator);
    };
    if (!summaryMode) {
      std::cout << "Loading test data from: " << lang.testDataFile << std::endl;
    }
    std::vector<TestCase> testCases = loadTestData(lang.testDataFile);
    if (testCases.empty()) {
      std::cerr << "No test cases loaded for " << lang.cliName << ". Skipping." << std::endl;
      continue;
    }
    if (!summaryMode) {
      std::cout << "Loaded " << testCases.size() << " test cases for " << lang.cliName << std::endl;
      std::cout << std::endl;
    }
    int perfectMatches = 0;
    int partialMatches = 0;
    int completeMisses = 0;
    double totalPrecision = 0.0;
    double totalRecall = 0.0;
    double totalF1 = 0.0;
    double totalWeighted = 0.0;
    int totalTP = 0, totalFP = 0, totalFN = 0;
    std::vector<std::pair<TestCase, EvaluationResult>> worstCases;
    for (const auto& testCase : testCases) {
      EvaluationResult result = evaluateWord(testCase, hyphenateFunc);
      totalTP += result.truePositives;
      totalFP += result.falsePositives;
      totalFN += result.falseNegatives;
      totalPrecision += result.precision;
      totalRecall += result.recall;
      totalF1 += result.f1Score;
      totalWeighted += result.weightedScore;
      if (result.f1Score == 1.0) {
        perfectMatches++;
      } else if (result.f1Score > 0.0) {
        partialMatches++;
      } else {
        completeMisses++;
      }
      worstCases.push_back({testCase, result});
    }
    if (summaryMode) {
      const double averageF1Percent = testCases.empty() ? 0.0 : (totalF1 / testCases.size() * 100.0);
      std::cout << lang.cliName << ": " << averageF1Percent << "%" << std::endl;
      continue;
    }
    std::sort(worstCases.begin(), worstCases.end(),
              [](const auto& a, const auto& b) { return a.second.weightedScore < b.second.weightedScore; });
    printResults(lang.cliName, testCases, worstCases, perfectMatches, partialMatches, completeMisses, totalPrecision,
                 totalRecall, totalF1, totalWeighted, totalTP, totalFP, totalFN, hyphenateFunc);
  }
  return 0;
 }
--- a/test/hyphenation_eval/resources/english_hyphenation_tests.txt
+++ b/test/hyphenation_eval/resources/english_hyphenation_tests.txt
--- a/test/hyphenation_eval/resources/french_hyphenation_tests.txt
+++ b/test/hyphenation_eval/resources/french_hyphenation_tests.txt
--- a/test/hyphenation_eval/resources/generate_hyphenation_test_data.py
+++ b/test/hyphenation_eval/resources/generate_hyphenation_test_data.py
@ -0,0 +1,204 @@
 """
 Generate hyphenation test data from a text file.
 This script extracts unique words from a book and generates ground truth
 hyphenations using the pyphen library, which can be used to test and validate
 the hyphenation implementations (e.g., German, English, Russian).
 Usage:
    python generate_hyphenation_test_data.py <input_file> <output_file> [--language de_DE]
 Requirements:
    pip install pyphen
 """
 import argparse
 import re
 from collections import Counter
 import pyphen
 from pathlib import Path
 import zipfile
 def extract_text_from_epub(epub_path):
    """Extract textual content from an .epub archive.

    Every archive member whose name ends in .xhtml, .html or .htm is decoded as
    UTF-8 (undecodable bytes are dropped), has its tags replaced by spaces and
    its HTML entities decoded, so that e.g. ``caf&eacute;`` becomes ``café``
    instead of later being tokenized as the two words ``caf`` and ``eacute``.

    Args:
        epub_path: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The text of all matching members joined with newlines, in archive order.
    """
    import html  # local import: only this function needs entity decoding

    html_suffixes = (".xhtml", ".html", ".htm")
    texts = []
    with zipfile.ZipFile(epub_path, "r") as z:
        for name in z.namelist():
            if not name.lower().endswith(html_suffixes):
                continue
            try:
                data = z.read(name).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one unreadable member should not abort the whole
                # extraction, but say so instead of dropping it silently.
                print(f"Warning: skipping unreadable entry {name!r}: {exc}")
                continue
            # Remove tags first, then decode entities, so an escaped "&lt;b&gt;"
            # stays literal text rather than being stripped as markup.
            text = re.sub(r"<[^>]+>", " ", data)
            texts.append(html.unescape(text))
    return "\n".join(texts)
 def extract_words(text):
    """Return every maximal run of letters found in ``text``, in order, case intact.

    A "letter" is any Unicode word character that is neither a digit nor an
    underscore, so words in any script are matched.
    """
    letter_run = re.compile(r"[^\W\d_]+", flags=re.UNICODE)
    return [match.group(0) for match in letter_run.finditer(text)]
 def clean_word(word):
    """Return ``word`` with leading and trailing whitespace removed.

    Case and all inner characters are left untouched.
    """
    trimmed = word.strip()
    return trimmed
 def generate_hyphenation_data(
    input_file, output_file, language="de_DE", min_length=6, max_words=None
 ):
    """
    Generate hyphenation test data from a text or .epub file.

    Words are extracted from the input, counted, ordered by descending frequency
    (ties broken alphabetically, case-insensitively) and hyphenated with pyphen.
    Words shorter than ``min_length`` are dropped; words without any break point
    are kept. One ``word|hyphenated|count`` line is written per word after a
    '#'-prefixed header.

    Args:
        input_file: Path to input text file (an ``.epub`` is unpacked first)
        output_file: Path to output file with hyphenation data
        language: Language code for pyphen (e.g., 'de_DE', 'en_US')
        min_length: Minimum word length to include
        max_words: Maximum number of words to include (None or 0 for all)

    Returns:
        None. If pyphen does not know ``language``, an error is printed and
        nothing is written.
    """
    print(f"Reading from: {input_file}")
    # Read the input file
    if str(input_file).lower().endswith(".epub"):
        print("Detected .epub input; extracting HTML content")
        text = extract_text_from_epub(input_file)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()
    # Extract words
    print("Extracting words...")
    words = extract_words(text)
    print(f"Found {len(words)} total words")
    # Count word frequencies
    word_counts = Counter(words)
    print(f"Found {len(word_counts)} unique words")
    # Initialize pyphen hyphenator
    print(f"Initializing hyphenator for language: {language}")
    try:
        hyphenator = pyphen.Pyphen(lang=language)
    except KeyError:
        print(f"Error: Language '{language}' not found in pyphen.")
        print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
        return
    # Generate hyphenations
    print("Generating hyphenations...")
    hyphenation_data = []
    # Sort by frequency (most common first) then alphabetically
    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0].lower()))
    for word, count in sorted_words:
        # Filter by minimum length
        if len(word) < min_length:
            continue
        # Get hyphenation (may produce no '=' characters)
        hyphenated = hyphenator.inserted(word, hyphen="=")
        # Include all words (so we can take the top N most common words even if
        # they don't have hyphenation points).
        hyphenation_data.append(
            {"word": word, "hyphenated": hyphenated, "count": count}
        )
        # Stop if we've reached max_words
        if max_words and len(hyphenation_data) >= max_words:
            break
    print(f"Generated {len(hyphenation_data)} hyphenated words")
    # Write output file
    print(f"Writing to: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        # Write header with metadata
        f.write("# Hyphenation Test Data\n")
        f.write(f"# Source: {Path(input_file).name}\n")
        f.write(f"# Language: {language}\n")
        f.write(f"# Total words: {len(hyphenation_data)}\n")
        f.write("# Format: word | hyphenated_form | frequency_in_source\n")
        f.write("#\n")
        f.write("# Hyphenation points are marked with '='\n")
        f.write("# Example: Silbentrennung -> Sil=ben=tren=nung\n")
        f.write("#\n\n")
        # Write data
        for item in hyphenation_data:
            f.write(f"{item['word']}|{item['hyphenated']}|{item['count']}\n")
    print("Done!")
    # Print some statistics. Words without any '=' are kept in the data, so the
    # "with hyphenation points" figure must be counted separately, and the
    # average must not divide by zero when no word met the length filter.
    total_breaks = sum(item["hyphenated"].count("=") for item in hyphenation_data)
    words_with_breaks = sum(1 for item in hyphenation_data if "=" in item["hyphenated"])
    average_breaks = total_breaks / len(hyphenation_data) if hyphenation_data else 0.0
    print("\n=== Statistics ===")
    print(f"Total unique words extracted: {len(word_counts)}")
    print(f"Words with hyphenation points: {words_with_breaks}")
    print(f"Average hyphenation points per word: {average_breaks:.2f}")
    # Print some examples
    print("\n=== Examples (first 10) ===")
    for item in hyphenation_data[:10]:
        print(
            f"  {item['word']:20} -> {item['hyphenated']:30} (appears {item['count']}x)"
        )
 def main():
    """Parse the command line and run generate_hyphenation_data with the result."""
    arg_parser = argparse.ArgumentParser(
        description="Generate hyphenation test data from a text file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
  # Generate test data from a German book
  python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt
  # Limit to 500 most common words
  python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt --max-words 500
  # Use English hyphenation (when available)
  python generate_hyphenation_test_data.py book.txt test_en.txt --language en_US
        """,
    )
    # Positional arguments
    arg_parser.add_argument("input_file", help="Input text file to extract words from")
    arg_parser.add_argument("output_file", help="Output file for hyphenation test data")
    # Optional tuning knobs
    arg_parser.add_argument(
        "--language", default="de_DE", help="Language code (default: de_DE)"
    )
    arg_parser.add_argument(
        "--min-length", type=int, default=6, help="Minimum word length (default: 6)"
    )
    arg_parser.add_argument(
        "--max-words",
        type=int,
        help="Maximum number of words to include (default: all)",
    )
    options = arg_parser.parse_args()
    # The parsed namespace holds exactly input_file, output_file, language,
    # min_length and max_words, which are the generator's parameter names.
    generate_hyphenation_data(**vars(options))
 # Run the CLI only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/test/hyphenation_eval/resources/german_hyphenation_tests.txt
+++ b/test/hyphenation_eval/resources/german_hyphenation_tests.txt
--- a/test/hyphenation_eval/resources/russian_hyphenation_tests.txt
+++ b/test/hyphenation_eval/resources/russian_hyphenation_tests.txt
--- a/test/run_hyphenation_eval.sh
+++ b/test/run_hyphenation_eval.sh
@ -0,0 +1,32 @@
 #!/usr/bin/env bash
 # Builds the hyphenation evaluation binary and runs it, forwarding all arguments.
 # Set CXX to pick a specific compiler; it defaults to `c++`.
 set -euo pipefail
 # Repository root = parent of the directory containing this script.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 BUILD_DIR="$ROOT_DIR/build/hyphenation_eval"
 BINARY="$BUILD_DIR/HyphenationEvaluationTest"
 mkdir -p "$BUILD_DIR"
 SOURCES=(
  "$ROOT_DIR/test/hyphenation_eval/HyphenationEvaluationTest.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/Hyphenator.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/LanguageRegistry.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/LiangHyphenation.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp"
  "$ROOT_DIR/lib/Utf8/Utf8.cpp"
 )
 CXXFLAGS=(
  -std=c++20
  -O2
  -Wall
  -Wextra
  -pedantic
  -I"$ROOT_DIR"
  -I"$ROOT_DIR/lib"
  -I"$ROOT_DIR/lib/Utf8"
 )
 "${CXX:-c++}" "${CXXFLAGS[@]}" "${SOURCES[@]}" -o "$BINARY"
 # The binary opens its test data through paths relative to the repository root
 # (test/hyphenation_eval/resources/...), so run it from there regardless of
 # the directory this script was invoked from.
 cd "$ROOT_DIR"
 "$BINARY" "$@"