Xteink-X4-crosspoint-reader/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
Luke Stein 7a53342f9d
fix: Allow line break after ellipsis and underscore (#425)
## Summary

* Add additional punctuation marks to the list of characters that can be
immediately followed by a line break even where there is no explicit
space

## Additional Context

* Huge appreciation to @osteotek for his amazing work on hyphenation.
Reading on the device is so much better now.
* I am getting bad line breaks when an ellipsis (…) sits between words and
the book file does not explicitly include some kind of breaking space.
* Per
[discussion](https://github.com/crosspoint-reader/crosspoint-reader/pull/305#issuecomment-3765411406),
several new characters are added in this PR to the `isExplicitHyphen`
list to allow line breaks immediately after them:

Character | Unicode | Usage | Why include it?
-- | -- | -- | --
Solidus (Slash) | U+002F | / | Essential for breaking URLs and "and/or"
constructs.
Backslash | U+005C | \ | Critical for technical text, file paths, and
coding documentation.
Underscore | U+005F | _ | Prevents "runaway" line lengths in usernames
or code snippets.
Middle Dot | U+00B7 | · | Acts as a semantic separator in dictionaries
or stylistic lists.
Ellipsis | U+2026 | … | Prevents justification failure when dialogue
lacks following spaces.
Midline Horizontal Ellipsis | U+22EF | ⋯ | Useful for mathematical
sequences and technical notation.


### Example:

This shows an example of what line breaking looks like *with* this PR.
Note the line break after "matter…" (which would not previously have
been allowed). It's particularly important here because the book
includes non-breaking spaces in "Mr. Aldrich" and "Mr. Rockefeller."


![IMG_2917](https://github.com/user-attachments/assets/8fa610a9-91dd-407f-8526-0019a8a7195f)

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? **PARTIALLY**
2026-01-27 20:18:09 +11:00

182 lines
4.5 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "HyphenationCommon.h"
#include <Utf8.h>
namespace {
// Lowercase a Latin code point. Handles ASCII A-Z, the Latin-1 Supplement
// capitals U+00C0-U+00DE (skipping the multiplication sign U+00D7), and three
// capitals whose lowercase form is not at a fixed offset. Any other value is
// returned unchanged.
uint32_t toLowerLatinImpl(const uint32_t cp) {
  // Capitals that need an explicit mapping.
  if (cp == 0x0152) return 0x0153;  // Œ -> œ
  if (cp == 0x0178) return 0x00FF;  // Ÿ -> ÿ
  if (cp == 0x1E9E) return 0x00DF;  // ẞ -> ß

  const bool asciiUpper = cp >= 'A' && cp <= 'Z';
  const bool latin1Upper = cp >= 0x00C0 && cp <= 0x00DE && cp != 0x00D7;

  // In both ranges the lowercase letter sits exactly 0x20 above the capital.
  return (asciiUpper || latin1Upper) ? cp + 0x20 : cp;
}
// Convert Cyrillic uppercase letters to lowercase.
//  - U+0410-U+042F (А-Я) map to U+0430-U+044F by adding 0x20.
//  - U+0400-U+040F (Ѐ-Џ; this row holds Ё, Є, І, Ї, Ў, Ј, Џ, ...) map to
//    U+0450-U+045F by adding 0x50.
// Every other code point is returned unchanged; capitals in the extended
// Cyrillic ranges above U+045F are not converted.
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
  if (cp >= 0x0410 && cp <= 0x042F) {
    return cp + 0x20;
  }
  if (cp >= 0x0400 && cp <= 0x040F) {
    return cp + 0x50;
  }
  return cp;
}
} // namespace
// Public entry point for Latin lowercasing; forwards to the file-local implementation.
uint32_t toLowerLatin(const uint32_t cp) {
  return toLowerLatinImpl(cp);
}
// Public entry point for Cyrillic lowercasing; forwards to the file-local implementation.
uint32_t toLowerCyrillic(const uint32_t cp) {
  return toLowerCyrillicImpl(cp);
}
// Report whether cp is one of the Latin letters this module recognises:
// ASCII A-Z / a-z, the Latin-1 Supplement letters U+00C0-U+00FF (excluding the
// multiplication sign U+00D7 and the division sign U+00F7), plus Œ, œ, Ÿ and ẞ.
bool isLatinLetter(const uint32_t cp) {
  const bool asciiLetter = (cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z');
  if (asciiLetter) {
    return true;
  }

  const bool latin1Letter = cp >= 0x00C0 && cp <= 0x00FF && cp != 0x00D7 && cp != 0x00F7;
  if (latin1Letter) {
    return true;
  }

  // Letters outside Latin-1 that are accepted individually.
  return cp == 0x0152      // Œ
         || cp == 0x0153   // œ
         || cp == 0x0178   // Ÿ
         || cp == 0x1E9E;  // ẞ
}
// True for every code point in U+0400-U+052F. This is a plain range test: no
// individual code point inside the range is filtered out.
bool isCyrillicLetter(const uint32_t cp) {
  constexpr uint32_t kFirst = 0x0400;
  constexpr uint32_t kLast = 0x052F;
  return kFirst <= cp && cp <= kLast;
}
// A code point is alphabetic when either the Latin or the Cyrillic letter test accepts it.
bool isAlphabetic(const uint32_t cp) {
  if (isLatinLetter(cp)) {
    return true;
  }
  return isCyrillicLetter(cp);
}
// Report whether cp belongs to the fixed set of punctuation-like characters
// listed below. The set is a closed whitelist (it also contains the no-break
// space); anything not listed yields false.
// NOTE(review): U+203A is listed but its opening counterpart U+2039 is not —
// confirm whether that asymmetry is intended.
bool isPunctuation(const uint32_t cp) {
  switch (cp) {
    // ASCII
    case '!':
    case '"':
    case '\'':
    case '(':
    case ')':
    case ',':
    case '-':
    case '.':
    case '/':
    case ':':
    case ';':
    case '?':
    case '[':
    case ']':
    case '{':
    case '}':
    // Latin-1 Supplement
    case 0x00A0:  // no-break space
    case 0x00AB:  // left-pointing double angle quotation mark
    case 0x00BB:  // right-pointing double angle quotation mark
    // General Punctuation
    case 0x2018:  // left single quotation mark
    case 0x2019:  // right single quotation mark
    case 0x201C:  // left double quotation mark
    case 0x201D:  // right double quotation mark
    case 0x2026:  // horizontal ellipsis
    case 0x203A:  // single right-pointing angle quotation mark
      return true;
  }
  return false;
}
// True only for the ten ASCII digits '0' through '9'.
bool isAsciiDigit(const uint32_t cp) {
  constexpr uint32_t kZero = '0';
  constexpr uint32_t kNine = '9';
  return kZero <= cp && cp <= kNine;
}
// Report whether cp is one of the hyphen/dash-like characters listed below.
// Besides hyphens, dashes and minus signs, the set also contains the
// underscore and the horizontal ellipsis. It is intended as the set of
// characters after which a line may break without an explicit space (how
// callers apply it is outside this file).
bool isExplicitHyphen(const uint32_t cp) {
  switch (cp) {
    // ASCII
    case '-':     // hyphen-minus
    case 0x005F:  // underscore
    // Latin-1 Supplement and Armenian
    case 0x00AD:  // soft hyphen
    case 0x058A:  // Armenian hyphen
    // General Punctuation
    case 0x2010:  // hyphen
    case 0x2011:  // non-breaking hyphen
    case 0x2012:  // figure dash
    case 0x2013:  // en dash
    case 0x2014:  // em dash
    case 0x2015:  // horizontal bar
    case 0x2026:  // horizontal ellipsis
    case 0x2043:  // hyphen bullet
    // Superscripts/subscripts and mathematical operators
    case 0x207B:  // superscript minus
    case 0x208B:  // subscript minus
    case 0x2212:  // minus sign
    // Supplemental Punctuation
    case 0x2E17:  // double oblique hyphen
    case 0x2E3A:  // two-em dash
    case 0x2E3B:  // three-em dash
    // Small and fullwidth form variants
    case 0xFE58:  // small em dash
    case 0xFE63:  // small hyphen-minus
    case 0xFF0D:  // fullwidth hyphen-minus
      return true;
  }
  return false;
}
// True only for U+00AD (soft hyphen).
bool isSoftHyphen(const uint32_t cp) {
  constexpr uint32_t kSoftHyphen = 0x00AD;
  return cp == kSoftHyphen;
}
// Strip decoration from a word's code point list, in place:
//   1. a trailing bracketed number such as "[12]" (with any punctuation that
//      follows it) is removed together with everything after the '[';
//   2. leading punctuation is removed;
//   3. trailing punctuation is removed.
// "Punctuation" is whatever isPunctuation() accepts. The vector may end up empty.
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
  if (cps.empty()) {
    return;
  }

  // Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
  if (cps.size() >= 3) {
    // Walk back over trailing punctuation; the closing ']' is itself punctuation,
    // so `end` lands on the last character before it.
    int end = static_cast<int>(cps.size()) - 1;
    while (end >= 0 && isPunctuation(cps[end].value)) {
      --end;
    }
    int pos = end;
    if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
      // Walk back over the run of digits; `pos` stops on the character before it.
      while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
        --pos;
      }
      // Here `end - pos` is the number of digits in the run.
      // NOTE(review): `> 1` therefore requires at least two digits, so a
      // single-digit reference such as "[1]" is not removed by this step —
      // confirm whether that is intended.
      if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
        cps.erase(cps.begin() + pos, cps.end());
      }
    }
  }

  // Drop leading punctuation with a single range erase so the remaining
  // elements are shifted once rather than once per removed character.
  auto firstKept = cps.begin();
  while (firstKept != cps.end() && isPunctuation(firstKept->value)) {
    ++firstKept;
  }
  cps.erase(cps.begin(), firstKept);

  // Drop trailing punctuation.
  while (!cps.empty() && isPunctuation(cps.back().value)) {
    cps.pop_back();
  }
}
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
std::vector<CodepointInfo> cps;
cps.reserve(word.size());
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.c_str());
const unsigned char* ptr = base;
while (*ptr != 0) {
const unsigned char* current = ptr;
const uint32_t cp = utf8NextCodepoint(&ptr);
cps.push_back({cp, static_cast<size_t>(current - base)});
}
return cps;
}