mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-05 15:17:37 +03:00
Add explicit hyphen handling and improve hyphenation logic in ParsedText and Hyphenator
This commit is contained in:
parent
f6767c857f
commit
cb1ecdb505
@ -1,6 +1,7 @@
|
|||||||
#include "ParsedText.h"
|
#include "ParsedText.h"
|
||||||
|
|
||||||
#include <GfxRenderer.h>
|
#include <GfxRenderer.h>
|
||||||
|
#include <Utf8.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -9,6 +10,7 @@
|
|||||||
#include <limits>
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "hyphenation/HyphenationCommon.h"
|
||||||
#include "hyphenation/Hyphenator.h"
|
#include "hyphenation/Hyphenator.h"
|
||||||
|
|
||||||
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
||||||
@ -18,8 +20,38 @@ namespace {
|
|||||||
struct HyphenSplitDecision {
|
struct HyphenSplitDecision {
|
||||||
size_t byteOffset;
|
size_t byteOffset;
|
||||||
uint16_t prefixWidth;
|
uint16_t prefixWidth;
|
||||||
|
bool appendHyphen; // true when we must draw an extra hyphen after the prefix glyphs
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Verifies whether the substring ending at `offset` already contains a literal hyphen glyph, so we can avoid
|
||||||
|
// drawing a duplicate hyphen when breaking the word.
|
||||||
|
bool endsWithExplicitHyphen(const std::string& word, const size_t offset) {
|
||||||
|
if (offset == 0 || offset > word.size()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.data());
|
||||||
|
const unsigned char* ptr = base;
|
||||||
|
const unsigned char* target = base + offset;
|
||||||
|
const unsigned char* lastStart = nullptr;
|
||||||
|
|
||||||
|
while (ptr < target) {
|
||||||
|
lastStart = ptr;
|
||||||
|
utf8NextCodepoint(&ptr);
|
||||||
|
if (ptr > target) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!lastStart || ptr != target) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const unsigned char* tmp = lastStart;
|
||||||
|
const uint32_t cp = utf8NextCodepoint(&tmp); // decode the codepoint immediately prior to the break
|
||||||
|
return isExplicitHyphen(cp);
|
||||||
|
}
|
||||||
|
|
||||||
bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||||||
const EpdFontStyle style, const int availableWidth, const bool includeFallback,
|
const EpdFontStyle style, const int availableWidth, const bool includeFallback,
|
||||||
HyphenSplitDecision* decision) {
|
HyphenSplitDecision* decision) {
|
||||||
@ -28,10 +60,6 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st
|
|||||||
}
|
}
|
||||||
|
|
||||||
const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
|
const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
|
||||||
const int adjustedWidth = availableWidth - hyphenWidth;
|
|
||||||
if (adjustedWidth <= 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto offsets = Hyphenator::breakOffsets(word, includeFallback);
|
auto offsets = Hyphenator::breakOffsets(word, includeFallback);
|
||||||
if (offsets.empty()) {
|
if (offsets.empty()) {
|
||||||
@ -40,13 +68,20 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st
|
|||||||
|
|
||||||
size_t chosenOffset = std::numeric_limits<size_t>::max();
|
size_t chosenOffset = std::numeric_limits<size_t>::max();
|
||||||
uint16_t chosenWidth = 0;
|
uint16_t chosenWidth = 0;
|
||||||
|
bool chosenAppendHyphen = true;
|
||||||
|
|
||||||
for (const size_t offset : offsets) {
|
for (const size_t offset : offsets) {
|
||||||
|
const bool needsInsertedHyphen = !endsWithExplicitHyphen(word, offset);
|
||||||
|
const int budget = availableWidth - (needsInsertedHyphen ? hyphenWidth : 0);
|
||||||
|
if (budget <= 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
const std::string prefix = word.substr(0, offset);
|
const std::string prefix = word.substr(0, offset);
|
||||||
const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
|
const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
|
||||||
if (prefixWidth <= adjustedWidth) {
|
if (prefixWidth <= budget) {
|
||||||
chosenOffset = offset;
|
chosenOffset = offset;
|
||||||
chosenWidth = static_cast<uint16_t>(prefixWidth + hyphenWidth);
|
chosenWidth = static_cast<uint16_t>(prefixWidth + (needsInsertedHyphen ? hyphenWidth : 0));
|
||||||
|
chosenAppendHyphen = needsInsertedHyphen;
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -58,6 +93,7 @@ bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const st
|
|||||||
|
|
||||||
decision->byteOffset = chosenOffset;
|
decision->byteOffset = chosenOffset;
|
||||||
decision->prefixWidth = chosenWidth;
|
decision->prefixWidth = chosenWidth;
|
||||||
|
decision->appendHyphen = chosenAppendHyphen;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -110,14 +146,17 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
|
|||||||
uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt);
|
uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt);
|
||||||
|
|
||||||
if (width > pageWidth) {
|
if (width > pageWidth) {
|
||||||
HyphenSplitDecision decision;
|
HyphenSplitDecision decision{};
|
||||||
if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision)) {
|
if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision)) {
|
||||||
const std::string originalWord = *wordsIt;
|
const std::string originalWord = *wordsIt;
|
||||||
const std::string tail = originalWord.substr(decision.byteOffset);
|
const std::string tail = originalWord.substr(decision.byteOffset);
|
||||||
if (tail.empty()) {
|
if (tail.empty()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
|
std::string prefix = originalWord.substr(0, decision.byteOffset);
|
||||||
|
if (decision.appendHyphen) {
|
||||||
|
prefix += "-";
|
||||||
|
}
|
||||||
|
|
||||||
*wordsIt = prefix;
|
*wordsIt = prefix;
|
||||||
auto nextWordIt = words.insert(std::next(wordsIt), tail);
|
auto nextWordIt = words.insert(std::next(wordsIt), tail);
|
||||||
@ -235,7 +274,7 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
|
|||||||
}
|
}
|
||||||
|
|
||||||
const int availableWidth = pageWidth - lineWidth - interWordSpace;
|
const int availableWidth = pageWidth - lineWidth - interWordSpace;
|
||||||
HyphenSplitDecision decision;
|
HyphenSplitDecision decision{};
|
||||||
if (!chooseSplitForWidth(renderer, fontId, *wordNodeIt, *styleNodeIt, availableWidth, false, &decision)) {
|
if (!chooseSplitForWidth(renderer, fontId, *wordNodeIt, *styleNodeIt, availableWidth, false, &decision)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -245,7 +284,10 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
|
|||||||
if (tail.empty()) {
|
if (tail.empty()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
|
std::string prefix = originalWord.substr(0, decision.byteOffset);
|
||||||
|
if (decision.appendHyphen) {
|
||||||
|
prefix += "-";
|
||||||
|
}
|
||||||
|
|
||||||
const EpdFontStyle styleForSplit = *styleNodeIt;
|
const EpdFontStyle styleForSplit = *styleNodeIt;
|
||||||
*wordNodeIt = tail;
|
*wordNodeIt = tail;
|
||||||
|
|||||||
@ -79,8 +79,6 @@ bool isPunctuation(const uint32_t cp) {
|
|||||||
case 0x2019: // ’
|
case 0x2019: // ’
|
||||||
case 0x201C: // “
|
case 0x201C: // “
|
||||||
case 0x201D: // ”
|
case 0x201D: // ”
|
||||||
case '[':
|
|
||||||
case ']':
|
|
||||||
case '{':
|
case '{':
|
||||||
case '}':
|
case '}':
|
||||||
case '/':
|
case '/':
|
||||||
@ -92,6 +90,33 @@ bool isPunctuation(const uint32_t cp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true when `cp` is one of the hyphen, dash or minus codepoints listed below;
// every other codepoint yields false.
bool isExplicitHyphen(const uint32_t cp) {
  static constexpr uint32_t kHyphenLikeCodepoints[] = {
      0x002D,  // '-' hyphen-minus
      0x00AD,  // soft hyphen
      0x058A,  // Armenian hyphen
      0x2010,  // hyphen
      0x2011,  // non-breaking hyphen
      0x2012,  // figure dash
      0x2013,  // en dash
      0x2014,  // em dash
      0x2015,  // horizontal bar
      0x2043,  // hyphen bullet
      0x207B,  // superscript minus
      0x208B,  // subscript minus
      0x2212,  // minus sign
      0x2E17,  // double oblique hyphen
      0x2E3A,  // two-em dash
      0x2E3B,  // three-em dash
      0xFE58,  // small em dash
      0xFE63,  // small hyphen-minus
      0xFF0D,  // fullwidth hyphen-minus
  };

  for (const uint32_t candidate : kHyphenLikeCodepoints) {
    if (candidate == cp) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
|
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
|
||||||
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
||||||
cps.erase(cps.begin());
|
cps.erase(cps.begin());
|
||||||
|
|||||||
@ -28,6 +28,7 @@ bool isCyrillicConsonant(uint32_t cp);
|
|||||||
bool isAlphabetic(uint32_t cp);
|
bool isAlphabetic(uint32_t cp);
|
||||||
bool isVowel(uint32_t cp);
|
bool isVowel(uint32_t cp);
|
||||||
bool isPunctuation(uint32_t cp);
|
bool isPunctuation(uint32_t cp);
|
||||||
|
bool isExplicitHyphen(uint32_t cp);
|
||||||
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
||||||
|
|
||||||
Script detectScript(const std::vector<CodepointInfo>& cps);
|
Script detectScript(const std::vector<CodepointInfo>& cps);
|
||||||
|
|||||||
@ -48,8 +48,6 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
|||||||
return cps;
|
return cps;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isExplicitHyphen(const uint32_t cp) { return cp == '-' || cp == 0x2010; }
|
|
||||||
|
|
||||||
std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
|
std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<size_t> indexes;
|
std::vector<size_t> indexes;
|
||||||
for (size_t i = 0; i < cps.size(); ++i) {
|
for (size_t i = 0; i < cps.size(); ++i) {
|
||||||
@ -74,6 +72,32 @@ std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo
|
|||||||
return indexes;
|
return indexes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True only for the ten ASCII digit codepoints '0' through '9'.
bool isAsciiDigit(const uint32_t cp) {
  const bool outsideDigitRange = cp < '0' || cp > '9';
  return !outsideDigitRange;
}
|
||||||
|
|
||||||
|
void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
|
||||||
|
if (cps.size() < 3) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int closing = static_cast<int>(cps.size()) - 1;
|
||||||
|
if (cps[closing].value != ']') {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int pos = closing - 1;
|
||||||
|
if (pos < 0 || !isAsciiDigit(cps[pos].value)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||||||
|
--pos;
|
||||||
|
}
|
||||||
|
if (pos < 0 || cps[pos].value != '[') {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (closing - pos <= 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
cps.erase(cps.begin() + pos, cps.end());
|
||||||
|
}
|
||||||
|
|
||||||
// Rejects words containing punctuation or digits unless forced.
|
// Rejects words containing punctuation or digits unless forced.
|
||||||
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
||||||
if (cps.empty()) {
|
if (cps.empty()) {
|
||||||
@ -120,11 +144,13 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
|
|||||||
|
|
||||||
auto cps = collectCodepoints(word);
|
auto cps = collectCodepoints(word);
|
||||||
trimSurroundingPunctuation(cps);
|
trimSurroundingPunctuation(cps);
|
||||||
|
trimTrailingFootnoteReference(cps);
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) {
|
auto explicitIndexes = collectExplicitHyphenIndexes(cps);
|
||||||
|
if (!explicitIndexes.empty()) {
|
||||||
std::sort(explicitIndexes.begin(), explicitIndexes.end());
|
std::sort(explicitIndexes.begin(), explicitIndexes.end());
|
||||||
explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
|
explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
|
||||||
std::vector<size_t> byteOffsets;
|
std::vector<size_t> byteOffsets;
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user