mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-04 14:47:37 +03:00
## Summary

* Add additional punctuation marks to the list of characters that can be immediately followed by a line break even where there is no explicit space.

## Additional Context

* Huge appreciation to @osteotek for his amazing work on hyphenation. Reading on the device is so much better now.
* I am getting bad line breaks when ellipses (…) appear between words and the book file does not explicitly include some kind of breaking space.
* Per [discussion](https://github.com/crosspoint-reader/crosspoint-reader/pull/305#issuecomment-3765411406), several new characters are added in this PR to the `isExplicitHyphen` list to allow line breaks immediately after them:

Character | Unicode | Usage | Why include it?
-- | -- | -- | --
Solidus (Slash) | U+002F | / | Essential for breaking URLs and "and/or" constructs.
Backslash | U+005C | \ | Critical for technical text, file paths, and coding documentation.
Underscore | U+005F | _ | Prevents "runaway" line lengths in usernames or code snippets.
Middle Dot | U+00B7 | · | Acts as a semantic separator in dictionaries or stylistic lists.
Ellipsis | U+2026 | … | Prevents justification failure when dialogue lacks following spaces.
Midline Horizontal Ellipsis | U+22EF | ⋯ | Useful for mathematical sequences and technical notation.

### Example

This shows an example of what line breaking looks like *with* this PR. Note the line break after "matter…" (which would not previously have been allowed). It's particularly important here because the book includes non-breaking spaces in "Mr. Aldrich" and "Mr. Rockefeller."

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers.

Did you use AI tools to help write this code? **PARTIALLY**
182 lines
4.5 KiB
C++
182 lines
4.5 KiB
C++
#include "HyphenationCommon.h"
|
||
|
||
#include <Utf8.h>
|
||
|
||
namespace {

// Lowercases Latin capitals from ASCII and the Latin-1 Supplement, plus a few
// stand-alone letters (Œ, Ÿ, ẞ). Any other code point is returned unchanged.
uint32_t toLowerLatinImpl(const uint32_t cp) {
  // ASCII A..Z -> a..z.
  if (cp >= 'A' && cp <= 'Z') {
    return cp - 'A' + 'a';
  }
  // Latin-1 capitals À..Ö and Ø..Þ sit exactly 0x20 below their lowercase
  // forms; U+00D7 (×) lies between the two runs and is deliberately skipped.
  if ((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00DE)) {
    return cp + 0x20;
  }

  switch (cp) {
    case 0x0152:      // Œ
      return 0x0153;  // œ
    case 0x0178:      // Ÿ
      return 0x00FF;  // ÿ
    case 0x1E9E:      // ẞ
      return 0x00DF;  // ß
    default:
      return cp;
  }
}

// Lowercases Cyrillic capitals. Any other code point is returned unchanged.
//  - U+0410..U+042F (А..Я) map to U+0430..U+044F by adding 0x20.
//  - U+0400..U+040F (Ѐ..Џ) map to U+0450..U+045F by adding 0x50. This row holds
//    Ё as well as the Ukrainian, Belarusian, Serbian and Macedonian letters
//    (Є, І, Ї, Ў, Ј, Љ, Њ, Џ, ...), so those are lowercased too instead of
//    only the single Ё special case.
// Capitals beyond U+042F (e.g. Ґ, U+0490) are not handled here.
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
  if (cp >= 0x0410 && cp <= 0x042F) {
    return cp + 0x20;
  }
  if (cp >= 0x0400 && cp <= 0x040F) {
    return cp + 0x50;
  }
  return cp;
}

}  // namespace
|
||
|
||
// Public entry point for Latin lowercasing; forwards to the file-local
// implementation and returns its result untouched.
uint32_t toLowerLatin(const uint32_t cp) {
  const uint32_t lowered = toLowerLatinImpl(cp);
  return lowered;
}
|
||
|
||
// Public entry point for Cyrillic lowercasing; forwards to the file-local
// implementation and returns its result untouched.
uint32_t toLowerCyrillic(const uint32_t cp) {
  const uint32_t lowered = toLowerCyrillicImpl(cp);
  return lowered;
}
|
||
|
||
// Returns true for the Latin letters this module knows about:
//  - ASCII A..Z and a..z,
//  - Latin-1 Supplement letters U+00C0..U+00FF, excluding the multiplication
//    sign (U+00D7) and the division sign (U+00F7),
//  - the stand-alone letters Œ, œ, Ÿ and ẞ.
bool isLatinLetter(const uint32_t cp) {
  const auto within = [cp](const uint32_t lo, const uint32_t hi) { return lo <= cp && cp <= hi; };

  if (within('A', 'Z') || within('a', 'z')) {
    return true;
  }

  // The three runs already step over U+00D7 and U+00F7, so no separate
  // exclusion is required.
  if (within(0x00C0, 0x00D6) || within(0x00D8, 0x00F6) || within(0x00F8, 0x00FF)) {
    return true;
  }

  return cp == 0x0152      // Œ
         || cp == 0x0153   // œ
         || cp == 0x0178   // Ÿ
         || cp == 0x1E9E;  // ẞ
}
|
||
|
||
// Returns true when cp lies in U+0400..U+052F (the Unicode Cyrillic and
// Cyrillic Supplement blocks). This is a pure range check: every code point in
// the range is accepted, whether or not it is actually a letter.
bool isCyrillicLetter(const uint32_t cp) {
  constexpr uint32_t kRangeFirst = 0x0400;
  constexpr uint32_t kRangeLast = 0x052F;
  if (cp < kRangeFirst) {
    return false;
  }
  return cp <= kRangeLast;
}
|
||
|
||
// Returns true when cp is accepted by either isLatinLetter or isCyrillicLetter.
bool isAlphabetic(const uint32_t cp) {
  if (isLatinLetter(cp)) {
    return true;
  }
  return isCyrillicLetter(cp);
}
|
||
|
||
// Returns true for code points treated as punctuation around a word: ASCII
// sentence punctuation, quotes, brackets and slash, typographic quotes and
// guillemets, the ellipsis and the no-break space.
//
// Both single angle quotation marks are accepted. U+2039 (‹) was previously
// missing while its closing partner U+203A (›) was present, so an opening ‹
// stuck to the front of a word was never recognised as punctuation.
bool isPunctuation(const uint32_t cp) {
  switch (cp) {
    // ASCII sentence punctuation and quotes.
    case '-':
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
    case '"':
    case '\'':
    // ASCII brackets and slash.
    case ')':
    case '(':
    case '{':
    case '}':
    case '[':
    case ']':
    case '/':
    // Typographic quotation marks.
    case 0x00AB:  // «
    case 0x00BB:  // »
    case 0x2018:  // ‘
    case 0x2019:  // ’
    case 0x201C:  // “
    case 0x201D:  // ”
    case 0x2039:  // ‹
    case 0x203A:  // ›
    // Miscellaneous.
    case 0x00A0:  // no-break space
    case 0x2026:  // …
      return true;
    default:
      return false;
  }
}
|
||
|
||
// Returns true only for the ASCII decimal digits '0'..'9'.
bool isAsciiDigit(const uint32_t cp) {
  constexpr uint32_t kZero = '0';
  constexpr uint32_t kNine = '9';
  return kZero <= cp && cp <= kNine;
}
|
||
|
||
// Returns true for code points after which a line break is permitted even
// without a following space. Despite the name, the set is not limited to
// hyphens and dashes: it also contains the underscore and the ellipsis.
bool isExplicitHyphen(const uint32_t cp) {
  static constexpr uint32_t kBreakAfter[] = {
      '-',     // hyphen-minus
      0x00AD,  // soft hyphen
      0x058A,  // Armenian hyphen
      0x2010,  // hyphen
      0x2011,  // non-breaking hyphen
      0x2012,  // figure dash
      0x2013,  // en dash
      0x2014,  // em dash
      0x2015,  // horizontal bar
      0x2043,  // hyphen bullet
      0x207B,  // superscript minus
      0x208B,  // subscript minus
      0x2212,  // minus sign
      0x2E17,  // double oblique hyphen
      0x2E3A,  // two-em dash
      0x2E3B,  // three-em dash
      0xFE58,  // small em dash
      0xFE63,  // small hyphen-minus
      0xFF0D,  // fullwidth hyphen-minus
      0x005F,  // underscore
      0x2026,  // ellipsis
  };

  for (const uint32_t candidate : kBreakAfter) {
    if (candidate == cp) {
      return true;
    }
  }
  return false;
}
|
||
|
||
// Returns true only for U+00AD (soft hyphen).
bool isSoftHyphen(const uint32_t cp) {
  constexpr uint32_t kSoftHyphen = 0x00AD;
  return kSoftHyphen == cp;
}
|
||
|
||
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
|
||
if (cps.empty()) {
|
||
return;
|
||
}
|
||
|
||
// Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
|
||
if (cps.size() >= 3) {
|
||
int end = static_cast<int>(cps.size()) - 1;
|
||
while (end >= 0 && isPunctuation(cps[end].value)) {
|
||
--end;
|
||
}
|
||
int pos = end;
|
||
if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||
--pos;
|
||
}
|
||
if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
|
||
cps.erase(cps.begin() + pos, cps.end());
|
||
}
|
||
}
|
||
}
|
||
|
||
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
||
cps.erase(cps.begin());
|
||
}
|
||
while (!cps.empty() && isPunctuation(cps.back().value)) {
|
||
cps.pop_back();
|
||
}
|
||
}
|
||
|
||
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
||
std::vector<CodepointInfo> cps;
|
||
cps.reserve(word.size());
|
||
|
||
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.c_str());
|
||
const unsigned char* ptr = base;
|
||
while (*ptr != 0) {
|
||
const unsigned char* current = ptr;
|
||
const uint32_t cp = utf8NextCodepoint(&ptr);
|
||
cps.push_back({cp, static_cast<size_t>(current - base)});
|
||
}
|
||
|
||
return cps;
|
||
}
|