#include "ParsedText.h"

// NOTE(review): the names of the bracketed includes below and the template argument lists used
// throughout this file (std::vector<...>, std::function<...>, static_cast<...>, ...) were not
// legible in the reviewed text and have been restored by inference - confirm against ParsedText.h.
#include <GfxRenderer.h>

#include <algorithm>
#include <cmath>
#include <functional>
#include <iterator>
#include <limits>
#include <vector>

#include "hyphenation/Hyphenator.h"

// Sentinel meaning "no valid line configuration found yet" in the line-breaking DP.
constexpr int MAX_COST = std::numeric_limits<int>::max();

namespace {

// Soft hyphen byte pattern used throughout EPUBs (UTF-8 for U+00AD).
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
constexpr size_t SOFT_HYPHEN_BYTES = 2;

// Returns true when the word contains at least one UTF-8 encoded soft hyphen.
bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }

// Removes every soft hyphen in-place so rendered glyphs match measured widths.
void stripSoftHyphensInPlace(std::string& word) {
  size_t pos = 0;
  // `pos` is deliberately not advanced after an erase: the bytes that followed the removed
  // soft hyphen now start at `pos` and must be examined too.
  while ((pos = word.find(SOFT_HYPHEN_UTF8, pos)) != std::string::npos) {
    word.erase(pos, SOFT_HYPHEN_BYTES);
  }
}

// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally
// appending a visible hyphen.
//
// The common case (no soft hyphen, no appended hyphen) measures the word directly and avoids
// copying the string.
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
                          const EpdFontFamily::Style style, const bool appendHyphen = false) {
  const bool hasSoftHyphen = containsSoftHyphen(word);
  if (!hasSoftHyphen && !appendHyphen) {
    return renderer.getTextWidth(fontId, word.c_str(), style);
  }

  std::string sanitized = word;
  if (hasSoftHyphen) {
    stripSoftHyphensInPlace(sanitized);
  }
  if (appendHyphen) {
    sanitized.push_back('-');
  }
  return renderer.getTextWidth(fontId, sanitized.c_str(), style);
}

}  // namespace

// Appends a word and its style to the paragraph. Empty words are ignored so that `words` and
// `wordStyles` always stay the same length.
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) {
  if (word.empty()) return;

  words.push_back(std::move(word));
  wordStyles.push_back(fontStyle);
}

// Lays the stored words out into lines no wider than `viewportWidth` and hands each line to
// `processLine`.
//
// Consumes data to minimize memory usage: the words of every emitted line are moved out of
// `words` / `wordStyles`. When `includeLastLine` is false the final line is not emitted and its
// words remain stored in this object.
void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fontId, const uint16_t viewportWidth,
                                       const std::function<void(std::shared_ptr<TextBlock>)>& processLine,
                                       const bool includeLastLine) {
  if (words.empty()) {
    return;
  }

  // Apply fixed transforms before any per-line layout work.
  applyParagraphIndent();

  const int pageWidth = viewportWidth;
  const int spaceWidth = renderer.getSpaceWidth(fontId);
  auto wordWidths = calculateWordWidths(renderer, fontId);

  std::vector<size_t> lineBreakIndices;
  if (hyphenationEnabled) {
    // Use greedy layout that can split words mid-loop when a hyphenated prefix fits.
    lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
  } else {
    lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
  }

  // Guard the unsigned `size() - 1` below: with no breaks it would wrap around to SIZE_MAX.
  if (lineBreakIndices.empty()) {
    return;
  }

  const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
  for (size_t i = 0; i < lineCount; ++i) {
    extractLine(i, pageWidth, spaceWidth, wordWidths, lineBreakIndices, processLine);
  }
}

// Measures every stored word with its own style and returns the widths in word order.
std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) {
  const size_t totalWordCount = words.size();

  std::vector<uint16_t> wordWidths;
  wordWidths.reserve(totalWordCount);

  // `words` and `wordStyles` are walked in lockstep; addWord() keeps them the same length.
  auto wordsIt = words.begin();
  auto wordStylesIt = wordStyles.begin();

  while (wordsIt != words.end()) {
    wordWidths.push_back(measureWordWidth(renderer, fontId, *wordsIt, *wordStylesIt));

    std::advance(wordsIt, 1);
    std::advance(wordStylesIt, 1);
  }

  return wordWidths;
}

// Computes line breaks with a dynamic program that minimises the sum of squared trailing
// space over all lines except the last.
//
// Returns, for each line, the index of the word that starts the NEXT line. May split
// over-wide words, which inserts entries into `words`, `wordStyles` and `wordWidths`.
std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
                                                  const int spaceWidth, std::vector<uint16_t>& wordWidths) {
  if (words.empty()) {
    return {};
  }

  // Ensure any word that would overflow even as the first entry on a line is split using
  // fallback hyphenation. A successful split inserts the remainder at i + 1, so the outer loop
  // re-examines it on the next iteration.
  for (size_t i = 0; i < wordWidths.size(); ++i) {
    while (wordWidths[i] > pageWidth) {
      if (!hyphenateWordAtIndex(i, pageWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
        break;
      }
    }
  }

  const size_t totalWordCount = words.size();

  // DP table to store the minimum badness (cost) of lines starting at index i
  std::vector<int> dp(totalWordCount);
  // 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
  std::vector<size_t> ans(totalWordCount);

  // Base Case
  dp[totalWordCount - 1] = 0;
  ans[totalWordCount - 1] = totalWordCount - 1;

  for (int i = static_cast<int>(totalWordCount) - 2; i >= 0; --i) {
    // Start at -spaceWidth so the first word of the line is not charged a leading space.
    int currlen = -spaceWidth;
    dp[i] = MAX_COST;

    for (size_t j = i; j < totalWordCount; ++j) {
      // Current line length: previous width + space + current word width
      currlen += wordWidths[j] + spaceWidth;

      if (currlen > pageWidth) {
        break;
      }

      int cost;
      if (j == totalWordCount - 1) {
        cost = 0;  // Last line
      } else {
        const int remainingSpace = pageWidth - currlen;
        // Use long long for the square to prevent overflow
        const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];

        if (cost_ll > MAX_COST) {
          cost = MAX_COST;
        } else {
          cost = static_cast<int>(cost_ll);
        }
      }

      if (cost < dp[i]) {
        dp[i] = cost;
        ans[i] = j;  // j is the index of the last word in this optimal line
      }
    }

    // Handle oversized word: if no valid configuration found, force single-word line
    // This prevents cascade failure where one oversized word breaks all preceding words
    if (dp[i] == MAX_COST) {
      ans[i] = i;  // Just this word on its own line
      // Inherit cost from next word to allow subsequent words to find valid configurations
      if (i + 1 < static_cast<int>(totalWordCount)) {
        dp[i] = dp[i + 1];
      } else {
        dp[i] = 0;
      }
    }
  }

  // Stores the index of the word that starts the next line (last_word_index + 1)
  std::vector<size_t> lineBreakIndices;
  size_t currentWordIndex = 0;

  while (currentWordIndex < totalWordCount) {
    size_t nextBreakIndex = ans[currentWordIndex] + 1;

    // Safety check: prevent infinite loop if nextBreakIndex doesn't advance
    if (nextBreakIndex <= currentWordIndex) {
      // Force advance by at least one word to avoid infinite loop
      nextBreakIndex = currentWordIndex + 1;
    }

    lineBreakIndices.push_back(nextBreakIndex);
    currentWordIndex = nextBreakIndex;
  }

  return lineBreakIndices;
}

// Prepends an em space (U+2003, UTF-8 E2 80 83) to the first stored word for justified and
// left-aligned blocks, unless extra paragraph spacing is enabled.
//
// NOTE(review): layoutAndExtractLines() calls this on every invocation. If a caller lays out
// with includeLastLine == false and later calls again, the first leftover word would receive
// an indent as well - verify against callers.
void ParsedText::applyParagraphIndent() {
  if (extraParagraphSpacing || words.empty()) {
    return;
  }

  if (style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN) {
    words.front().insert(0, "\xe2\x80\x83");
  }
}

// Builds break indices while opportunistically splitting the word that would overflow the current line.
//
// Greedy: each line takes as many words as fit; the first word that does not fit is split if a
// hyphenation prefix fits the remaining width. Returns, for each line, the index of the word
// that starts the next line. Splitting inserts entries into `words`, `wordStyles` and
// `wordWidths`.
std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
                                                            const int pageWidth, const int spaceWidth,
                                                            std::vector<uint16_t>& wordWidths) {
  std::vector<size_t> lineBreakIndices;
  size_t currentIndex = 0;

  while (currentIndex < wordWidths.size()) {
    const size_t lineStart = currentIndex;
    int lineWidth = 0;

    // Consume as many words as possible for current line, splitting when prefixes fit
    while (currentIndex < wordWidths.size()) {
      const bool isFirstWord = currentIndex == lineStart;
      const int spacing = isFirstWord ? 0 : spaceWidth;
      const int candidateWidth = spacing + wordWidths[currentIndex];

      // Word fits on current line
      if (lineWidth + candidateWidth <= pageWidth) {
        lineWidth += candidateWidth;
        ++currentIndex;
        continue;
      }

      // Word would overflow - try to split based on hyphenation points
      const int availableWidth = pageWidth - lineWidth - spacing;
      const bool allowFallbackBreaks = isFirstWord;  // Only for first word on line

      if (availableWidth > 0 &&
          hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
        // Prefix now fits; append it to this line and move to next line
        lineWidth += spacing + wordWidths[currentIndex];
        ++currentIndex;
        break;
      }

      // Could not split: force at least one word per line to avoid infinite loop
      if (currentIndex == lineStart) {
        lineWidth += candidateWidth;
        ++currentIndex;
      }
      break;
    }

    lineBreakIndices.push_back(currentIndex);
  }

  return lineBreakIndices;
}

// Splits words[wordIndex] into prefix (adding a hyphen only when needed) and remainder when a legal breakpoint fits the
// available width.
//
// On success the prefix replaces the word at `wordIndex`, the remainder (same style) is inserted
// directly after it, `wordWidths` is updated to match, and true is returned. Returns false and
// changes nothing when no breakpoint yields a prefix that fits.
bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
                                      const int fontId, std::vector<uint16_t>& wordWidths,
                                      const bool allowFallbackBreaks) {
  // Guard against invalid indices or zero available width before attempting to split.
  if (availableWidth <= 0 || wordIndex >= words.size()) {
    return false;
  }

  // Get iterators to target word and style.
  auto wordIt = words.begin();
  auto styleIt = wordStyles.begin();
  std::advance(wordIt, wordIndex);
  std::advance(styleIt, wordIndex);

  const std::string& word = *wordIt;
  const auto style = *styleIt;

  // Collect candidate breakpoints (byte offsets and hyphen requirements).
  auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
  if (breakInfos.empty()) {
    return false;
  }

  size_t chosenOffset = 0;
  int chosenWidth = -1;
  bool chosenNeedsHyphen = true;

  // Iterate over each legal breakpoint and retain the widest prefix that still fits.
  for (const auto& info : breakInfos) {
    const size_t offset = info.byteOffset;
    // A split at either end of the word would leave an empty prefix or remainder.
    if (offset == 0 || offset >= word.size()) {
      continue;
    }

    const bool needsHyphen = info.requiresInsertedHyphen;
    const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
    if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
      continue;  // Skip if too wide or not an improvement
    }

    chosenWidth = prefixWidth;
    chosenOffset = offset;
    chosenNeedsHyphen = needsHyphen;
  }

  if (chosenWidth < 0) {
    // No hyphenation point produced a prefix that fits in the remaining space.
    return false;
  }

  // Split the word at the selected breakpoint and append a hyphen if required.
  // The remainder must be copied out before the resize below truncates the word.
  std::string remainder = word.substr(chosenOffset);
  wordIt->resize(chosenOffset);
  if (chosenNeedsHyphen) {
    wordIt->push_back('-');
  }

  // Measure the remainder while we still own it so it can be moved, not copied, into the list.
  const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);

  // Insert the remainder word (with matching style) directly after the prefix.
  words.insert(std::next(wordIt), std::move(remainder));
  wordStyles.insert(std::next(styleIt), style);

  // Update cached widths to reflect the new prefix/remainder pairing.
  wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
  wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
  return true;
}

// Emits line number `breakIndex`: computes an x position for each of its words according to the
// block style, moves those words and styles out of the front of `words` / `wordStyles`, strips
// soft hyphens from them, and passes the resulting TextBlock to `processLine`.
//
// Must be called for consecutive lines starting at 0: the words are always taken from the front
// of the member lists, while `wordWidths` is indexed by absolute word position.
void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
                             const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices,
                             const std::function<void(std::shared_ptr<TextBlock>)>& processLine) {
  const size_t lineBreak = lineBreakIndices[breakIndex];
  const size_t lastBreakAt = breakIndex > 0 ? lineBreakIndices[breakIndex - 1] : 0;
  const size_t lineWordCount = lineBreak - lastBreakAt;
  // Number of inter-word gaps, kept signed so the arithmetic below cannot wrap around.
  const int gapCount = static_cast<int>(lineWordCount) - 1;

  // Calculate total word width for this line
  int lineWordWidthSum = 0;
  for (size_t i = lastBreakAt; i < lineBreak; i++) {
    lineWordWidthSum += wordWidths[i];
  }

  // Calculate spacing
  const int spareSpace = pageWidth - lineWordWidthSum;

  int spacing = spaceWidth;
  const bool isLastLine = breakIndex == lineBreakIndices.size() - 1;

  if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) {
    // Spread all spare width (which includes the normal spaces) evenly over the gaps.
    spacing = spareSpace / gapCount;
  }

  // Calculate initial x position.
  // Done in signed int and clamped at zero: a forced line holding a single word wider than the
  // page has negative spare space, which previously wrapped through unsigned arithmetic into a
  // huge uint16_t offset and pushed the word off-screen.
  int startX = 0;
  if (style == TextBlock::RIGHT_ALIGN) {
    startX = spareSpace - gapCount * spaceWidth;
  } else if (style == TextBlock::CENTER_ALIGN) {
    startX = (spareSpace - gapCount * spaceWidth) / 2;
  }
  if (startX < 0) {
    startX = 0;
  }
  uint16_t xpos = static_cast<uint16_t>(startX);

  // Pre-calculate X positions for words
  std::list<uint16_t> lineXPos;
  for (size_t i = lastBreakAt; i < lineBreak; i++) {
    const uint16_t currentWordWidth = wordWidths[i];
    lineXPos.push_back(xpos);
    xpos += currentWordWidth + spacing;
  }

  // Iterators always start at the beginning as we are moving content with splice below
  auto wordEndIt = words.begin();
  auto wordStyleEndIt = wordStyles.begin();
  std::advance(wordEndIt, lineWordCount);
  std::advance(wordStyleEndIt, lineWordCount);

  // *** CRITICAL STEP: CONSUME DATA USING SPLICE ***
  std::list<std::string> lineWords;
  lineWords.splice(lineWords.begin(), words, words.begin(), wordEndIt);
  std::list<EpdFontFamily::Style> lineWordStyles;
  lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);

  // Widths were measured without soft hyphens, so remove them before the words are rendered.
  for (auto& word : lineWords) {
    if (containsSoftHyphen(word)) {
      stripSoftHyphensInPlace(word);
    }
  }

  processLine(std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
}