Cleanup ParsedText

This commit is contained in:
Dave Allie 2025-12-12 20:36:13 +11:00
parent a9d702777e
commit 07151479f0
No known key found for this signature in database
GPG Key ID: F2FDDB3AD8D0276F
6 changed files with 139 additions and 148 deletions

View File

@ -117,13 +117,21 @@ void XMLCALL EpubHtmlParserSlim::characterData(void* userData, const XML_Char* s
return; return;
} }
EpdFontStyle fontStyle = REGULAR;
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
fontStyle = BOLD_ITALIC;
} else if (self->boldUntilDepth < self->depth) {
fontStyle = BOLD;
} else if (self->italicUntilDepth < self->depth) {
fontStyle = ITALIC;
}
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
if (isWhitespace(s[i])) { if (isWhitespace(s[i])) {
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) { if (self->partWordBufferIndex > 0) {
self->partWordBuffer[self->partWordBufferIndex] = '\0'; self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->boldUntilDepth < self->depth, self->italicUntilDepth < self->depth);
self->partWordBufferIndex = 0; self->partWordBufferIndex = 0;
} }
// Skip the whitespace char // Skip the whitespace char
@ -133,8 +141,7 @@ void XMLCALL EpubHtmlParserSlim::characterData(void* userData, const XML_Char* s
// If we're about to run out of space, then cut the word off and start a new one // If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) { if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->partWordBuffer[self->partWordBufferIndex] = '\0'; self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->boldUntilDepth < self->depth, self->italicUntilDepth < self->depth);
self->partWordBufferIndex = 0; self->partWordBufferIndex = 0;
} }
@ -156,9 +163,17 @@ void XMLCALL EpubHtmlParserSlim::endElement(void* userData, const XML_Char* name
matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || self->depth == 1; matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || self->depth == 1;
if (shouldBreakText) { if (shouldBreakText) {
EpdFontStyle fontStyle = REGULAR;
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
fontStyle = BOLD_ITALIC;
} else if (self->boldUntilDepth < self->depth) {
fontStyle = BOLD;
} else if (self->italicUntilDepth < self->depth) {
fontStyle = ITALIC;
}
self->partWordBuffer[self->partWordBufferIndex] = '\0'; self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->boldUntilDepth < self->depth, self->italicUntilDepth < self->depth);
self->partWordBufferIndex = 0; self->partWordBufferIndex = 0;
} }
} }
@ -263,7 +278,7 @@ void EpubHtmlParserSlim::makePages() {
// Long running task, make sure to let other things happen // Long running task, make sure to let other things happen
vTaskDelay(1); vTaskDelay(1);
const auto lines = currentTextBlock->splitIntoLines(renderer, fontId, marginLeft + marginRight); const auto lines = currentTextBlock->layoutAndExtractLines(renderer, fontId, marginLeft + marginRight);
for (auto&& line : lines) { for (auto&& line : lines) {
if (currentPageNextY + lineHeight > pageHeight) { if (currentPageNextY + lineHeight > pageHeight) {

View File

@ -1,144 +1,137 @@
#include "ParsedText.h" #include "ParsedText.h"
#include <GfxRenderer.h> #include <GfxRenderer.h>
#include <Serialization.h>
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector> #include <vector>
void ParsedText::addWord(std::string word, const bool is_bold, const bool is_italic) { constexpr int MAX_COST = std::numeric_limits<int>::max();
if (word.length() == 0) return;
void ParsedText::addWord(std::string word, const EpdFontStyle fontStyle) {
if (word.empty()) return;
words.push_back(std::move(word)); words.push_back(std::move(word));
wordStyles.push_back((is_bold ? TextBlock::BOLD_SPAN : 0) | (is_italic ? TextBlock::ITALIC_SPAN : 0)); wordStyles.push_back(fontStyle);
} }
// Consumes data // Consumes data to minimize memory usage
std::list<std::shared_ptr<TextBlock>> ParsedText::splitIntoLines(const GfxRenderer& renderer, const int fontId, std::list<std::shared_ptr<TextBlock>> ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fontId,
const int horizontalMargin) { const int horizontalMargin) {
const int totalWordCount = words.size(); if (words.empty()) {
const int pageWidth = GfxRenderer::getScreenWidth() - horizontalMargin; return {};
}
const size_t totalWordCount = words.size();
const int pageWidth = renderer.getScreenWidth() - horizontalMargin;
const int spaceWidth = renderer.getSpaceWidth(fontId); const int spaceWidth = renderer.getSpaceWidth(fontId);
// measure each word
std::vector<uint16_t> wordWidths; std::vector<uint16_t> wordWidths;
{ wordWidths.reserve(totalWordCount);
auto wordsIt = words.begin();
auto wordStylesIt = wordStyles.begin(); auto wordsIt = words.begin();
while (wordsIt != words.end() && wordStylesIt != wordStyles.end()) { auto wordStylesIt = wordStyles.begin();
// measure the word
EpdFontStyle fontStyle = REGULAR; while (wordsIt != words.end()) {
if (*wordStylesIt & TextBlock::BOLD_SPAN) { wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
if (*wordStylesIt & TextBlock::ITALIC_SPAN) {
fontStyle = BOLD_ITALIC; std::advance(wordsIt, 1);
} else { std::advance(wordStylesIt, 1);
fontStyle = BOLD;
}
} else if (*wordStylesIt & TextBlock::ITALIC_SPAN) {
fontStyle = ITALIC;
}
const int width = renderer.getTextWidth(fontId, wordsIt->c_str(), fontStyle);
wordWidths.push_back(width);
std::advance(wordsIt, 1);
std::advance(wordStylesIt, 1);
}
} }
// Array in which ans[i] store index of last word in line starting with word // DP table to store the minimum badness (cost) of lines starting at index i
// word[i] std::vector<int> dp(totalWordCount);
size_t ans[totalWordCount]; // 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
{ std::vector<size_t> ans(totalWordCount);
// now apply the dynamic programming algorithm to find the best line breaks
// DP table in which dp[i] represents cost of line starting with word words[i]
int dp[totalWordCount];
// If only one word is present then only one line is required. Cost of last // Base Case
// line is zero. Hence cost of this line is zero. Ending point is also n-1 as dp[totalWordCount - 1] = 0;
// single word is present ans[totalWordCount - 1] = totalWordCount - 1;
dp[totalWordCount - 1] = 0;
ans[totalWordCount - 1] = totalWordCount - 1;
// Make each word first word of line by iterating over each index in arr. for (int i = totalWordCount - 2; i >= 0; --i) {
for (int i = totalWordCount - 2; i >= 0; i--) { int currlen = -spaceWidth;
int currlen = -1; dp[i] = MAX_COST;
dp[i] = INT_MAX;
for (size_t j = i; j < totalWordCount; ++j) {
// Current line length: previous width + space + current word width
currlen += wordWidths[j] + spaceWidth;
if (currlen > pageWidth) {
break;
}
// Variable to store possible minimum cost of line.
int cost; int cost;
if (j == totalWordCount - 1) {
cost = 0; // Last line
} else {
const int remainingSpace = pageWidth - currlen;
// Use long long for the square to prevent overflow
const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
// Keep on adding words in current line by iterating from starting word upto if (cost_ll > MAX_COST) {
// last word in arr. cost = MAX_COST;
for (int j = i; j < totalWordCount; j++) { } else {
// Update the width of the words in current line + the space between two cost = static_cast<int>(cost_ll);
// words.
currlen += wordWidths[j] + spaceWidth;
// If we're bigger than the current pagewidth then we can't add more words
if (currlen > pageWidth) break;
// if we've run out of words then this is last line and the cost should be
// 0 Otherwise the cost is the sqaure of the left over space + the costs
// of all the previous lines
if (j == totalWordCount - 1)
cost = 0;
else
cost = (pageWidth - currlen) * (pageWidth - currlen) + dp[j + 1];
// Check if this arrangement gives minimum cost for line starting with
// word words[i].
if (cost < dp[i]) {
dp[i] = cost;
ans[i] = j;
} }
} }
if (cost < dp[i]) {
dp[i] = cost;
ans[i] = j; // j is the index of the last word in this optimal line
}
} }
} }
// We can now iterate through the answer to find the line break positions // Stores the index of the word that starts the next line (last_word_index + 1)
std::list<uint16_t> lineBreaks; std::vector<size_t> lineBreakIndices;
for (size_t i = 0; i < totalWordCount;) { size_t currentWordIndex = 0;
i = ans[i] + 1; constexpr size_t MAX_LINES = 1000;
if (i > totalWordCount) {
break; while (currentWordIndex < totalWordCount) {
} if (lineBreakIndices.size() >= MAX_LINES) {
lineBreaks.push_back(i);
// Text too big, just exit
if (lineBreaks.size() > 1000) {
break; break;
} }
size_t nextBreakIndex = ans[currentWordIndex] + 1;
lineBreakIndices.push_back(nextBreakIndex);
currentWordIndex = nextBreakIndex;
} }
std::list<std::shared_ptr<TextBlock>> lines; std::list<std::shared_ptr<TextBlock>> lines;
// With the line breaks calculated we can now position the words along the // Initialize iterators for consumption
// line
auto wordStartIt = words.begin(); auto wordStartIt = words.begin();
auto wordStyleStartIt = wordStyles.begin(); auto wordStyleStartIt = wordStyles.begin();
auto wordWidthStartIt = wordWidths.begin(); size_t wordWidthIndex = 0;
uint16_t lastBreakAt = 0;
for (const auto lineBreak : lineBreaks) {
const int lineWordCount = lineBreak - lastBreakAt;
size_t lastBreakAt = 0;
for (const size_t lineBreak : lineBreakIndices) {
const size_t lineWordCount = lineBreak - lastBreakAt;
// Calculate end iterators for the range to splice
auto wordEndIt = wordStartIt; auto wordEndIt = wordStartIt;
auto wordStyleEndIt = wordStyleStartIt; auto wordStyleEndIt = wordStyleStartIt;
auto wordWidthEndIt = wordWidthStartIt;
std::advance(wordEndIt, lineWordCount); std::advance(wordEndIt, lineWordCount);
std::advance(wordStyleEndIt, lineWordCount); std::advance(wordStyleEndIt, lineWordCount);
std::advance(wordWidthEndIt, lineWordCount);
// Calculate total word width for this line
int lineWordWidthSum = 0; int lineWordWidthSum = 0;
for (auto it = wordWidthStartIt; it != wordWidthEndIt; std::advance(it, 1)) { for (size_t i = 0; i < lineWordCount; ++i) {
lineWordWidthSum += *it; lineWordWidthSum += wordWidths[wordWidthIndex + i];
} }
// Calculate spacing between words // Calculate spacing
const uint16_t spareSpace = pageWidth - lineWordWidthSum; const int spareSpace = pageWidth - lineWordWidthSum;
uint16_t spacing = spaceWidth; int spacing = spaceWidth;
// evenly space words if using justified style, not the last line, and at const bool isLastLine = lineBreak == totalWordCount;
// least 2 words
if (style == TextBlock::JUSTIFIED && lineBreak != lineBreaks.back() && lineWordCount >= 2) { if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) {
spacing = spareSpace / (lineWordCount - 1); spacing = spareSpace / (lineWordCount - 1);
} }
// Calculate initial x position
uint16_t xpos = 0; uint16_t xpos = 0;
if (style == TextBlock::RIGHT_ALIGN) { if (style == TextBlock::RIGHT_ALIGN) {
xpos = spareSpace - (lineWordCount - 1) * spaceWidth; xpos = spareSpace - (lineWordCount - 1) * spaceWidth;
@ -146,24 +139,27 @@ std::list<std::shared_ptr<TextBlock>> ParsedText::splitIntoLines(const GfxRender
xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2; xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2;
} }
// Pre-calculate X positions for words
std::list<uint16_t> lineXPos; std::list<uint16_t> lineXPos;
for (size_t i = 0; i < lineWordCount; ++i) {
for (auto it = wordWidthStartIt; it != wordWidthEndIt; std::advance(it, 1)) { const uint16_t currentWordWidth = wordWidths[wordWidthIndex + i];
lineXPos.push_back(xpos); lineXPos.push_back(xpos);
xpos += *it + spacing; xpos += currentWordWidth + spacing;
} }
// *** CRITICAL STEP: CONSUME DATA USING SPLICE ***
std::list<std::string> lineWords; std::list<std::string> lineWords;
std::list<uint8_t> lineWordStyles;
lineWords.splice(lineWords.begin(), words, wordStartIt, wordEndIt); lineWords.splice(lineWords.begin(), words, wordStartIt, wordEndIt);
std::list<EpdFontStyle> lineWordStyles;
lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyleStartIt, wordStyleEndIt); lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyleStartIt, wordStyleEndIt);
lines.push_back( lines.push_back(
std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style)); std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
// Update pointers/indices for the next line
wordStartIt = wordEndIt; wordStartIt = wordEndIt;
wordStyleStartIt = wordStyleEndIt; wordStyleStartIt = wordStyleEndIt;
wordWidthStartIt = wordWidthEndIt; wordWidthIndex += lineWordCount;
lastBreakAt = lineBreak; lastBreakAt = lineBreak;
} }

View File

@ -1,24 +1,29 @@
#pragma once #pragma once
#include <EpdFontFamily.h>
#include <cstdint>
#include <list> #include <list>
#include <memory>
#include <string> #include <string>
#include "blocks/TextBlock.h" #include "blocks/TextBlock.h"
class GfxRenderer;
class ParsedText { class ParsedText {
std::list<std::string> words; std::list<std::string> words;
std::list<uint8_t> wordStyles; std::list<EpdFontStyle> wordStyles;
// the style of the block - left, center, right aligned
TextBlock::BLOCK_STYLE style; TextBlock::BLOCK_STYLE style;
public: public:
explicit ParsedText(const TextBlock::BLOCK_STYLE style) : style(style) {} explicit ParsedText(const TextBlock::BLOCK_STYLE style) : style(style) {}
explicit ParsedText(std::list<std::string> words, std::list<uint8_t> word_styles, const TextBlock::BLOCK_STYLE style)
: words(std::move(words)), wordStyles(std::move(word_styles)), style(style) {}
~ParsedText() = default; ~ParsedText() = default;
void addWord(std::string word, bool is_bold, bool is_italic);
void addWord(std::string word, EpdFontStyle fontStyle);
void setStyle(const TextBlock::BLOCK_STYLE style) { this->style = style; } void setStyle(const TextBlock::BLOCK_STYLE style) { this->style = style; }
TextBlock::BLOCK_STYLE getStyle() const { return style; } TextBlock::BLOCK_STYLE getStyle() const { return style; }
bool isEmpty() const { return words.empty(); } bool isEmpty() const { return words.empty(); }
std::list<std::shared_ptr<TextBlock>> splitIntoLines(const GfxRenderer& renderer, int fontId, int horizontalMargin); std::list<std::shared_ptr<TextBlock>> layoutAndExtractLines(const GfxRenderer& renderer, int fontId,
int horizontalMargin);
}; };

View File

@ -3,29 +3,13 @@
#include <GfxRenderer.h> #include <GfxRenderer.h>
#include <Serialization.h> #include <Serialization.h>
void TextBlock::addWord(std::string word, const bool is_bold, const bool is_italic) {
if (word.length() == 0) return;
words.push_back(std::move(word));
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
}
void TextBlock::render(const GfxRenderer& renderer, const int fontId, const int x, const int y) const { void TextBlock::render(const GfxRenderer& renderer, const int fontId, const int x, const int y) const {
auto wordIt = words.begin(); auto wordIt = words.begin();
auto wordStylesIt = wordStyles.begin(); auto wordStylesIt = wordStyles.begin();
auto wordXposIt = wordXpos.begin(); auto wordXposIt = wordXpos.begin();
for (int i = 0; i < words.size(); i++) { for (int i = 0; i < words.size(); i++) {
// render the word renderer.drawText(fontId, *wordXposIt + x, y, wordIt->c_str(), true, *wordStylesIt);
EpdFontStyle fontStyle = REGULAR;
if (*wordStylesIt & BOLD_SPAN && *wordStylesIt & ITALIC_SPAN) {
fontStyle = BOLD_ITALIC;
} else if (*wordStylesIt & BOLD_SPAN) {
fontStyle = BOLD;
} else if (*wordStylesIt & ITALIC_SPAN) {
fontStyle = ITALIC;
}
renderer.drawText(fontId, *wordXposIt + x, y, wordIt->c_str(), true, fontStyle);
std::advance(wordIt, 1); std::advance(wordIt, 1);
std::advance(wordStylesIt, 1); std::advance(wordStylesIt, 1);
@ -57,7 +41,7 @@ std::unique_ptr<TextBlock> TextBlock::deserialize(std::istream& is) {
uint32_t wc, xc, sc; uint32_t wc, xc, sc;
std::list<std::string> words; std::list<std::string> words;
std::list<uint16_t> wordXpos; std::list<uint16_t> wordXpos;
std::list<uint8_t> wordStyles; std::list<EpdFontStyle> wordStyles;
BLOCK_STYLE style; BLOCK_STYLE style;
// words // words

View File

@ -1,4 +1,6 @@
#pragma once #pragma once
#include <EpdFontFamily.h>
#include <list> #include <list>
#include <memory> #include <memory>
#include <string> #include <string>
@ -8,11 +10,6 @@
// represents a block of words in the html document // represents a block of words in the html document
class TextBlock final : public Block { class TextBlock final : public Block {
public: public:
enum SPAN_STYLE : uint8_t {
BOLD_SPAN = 1,
ITALIC_SPAN = 2,
};
enum BLOCK_STYLE : uint8_t { enum BLOCK_STYLE : uint8_t {
JUSTIFIED = 0, JUSTIFIED = 0,
LEFT_ALIGN = 1, LEFT_ALIGN = 1,
@ -23,19 +20,14 @@ class TextBlock final : public Block {
private: private:
std::list<std::string> words; std::list<std::string> words;
std::list<uint16_t> wordXpos; std::list<uint16_t> wordXpos;
std::list<uint8_t> wordStyles; std::list<EpdFontStyle> wordStyles;
// the style of the block - left, center, right aligned
BLOCK_STYLE style; BLOCK_STYLE style;
public: public:
explicit TextBlock(const BLOCK_STYLE style) : style(style) {} explicit TextBlock(std::list<std::string> words, std::list<uint16_t> word_xpos, std::list<EpdFontStyle> word_styles,
explicit TextBlock(std::list<std::string> words, std::list<uint16_t> word_xpos, const BLOCK_STYLE style)
// the styles of each word
std::list<uint8_t> word_styles, const BLOCK_STYLE style)
: words(std::move(words)), wordXpos(std::move(word_xpos)), wordStyles(std::move(word_styles)), style(style) {} : words(std::move(words)), wordXpos(std::move(word_xpos)), wordStyles(std::move(word_styles)), style(style) {}
~TextBlock() override = default; ~TextBlock() override = default;
void addWord(std::string word, bool is_bold, bool is_italic);
void setStyle(const BLOCK_STYLE style) { this->style = style; } void setStyle(const BLOCK_STYLE style) { this->style = style; }
BLOCK_STYLE getStyle() const { return style; } BLOCK_STYLE getStyle() const { return style; }
bool isEmpty() override { return words.empty(); } bool isEmpty() override { return words.empty(); }

View File

@ -1,11 +1,10 @@
#pragma once #pragma once
#include <EInkDisplay.h> #include <EInkDisplay.h>
#include <EpdFontFamily.h>
#include <map> #include <map>
#include "EpdFontFamily.h"
class GfxRenderer { class GfxRenderer {
public: public:
enum FontRenderMode { BW, GRAYSCALE_LSB, GRAYSCALE_MSB }; enum FontRenderMode { BW, GRAYSCALE_LSB, GRAYSCALE_MSB };