mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2025-12-18 15:17:42 +03:00
Cleanup ParsedText
This commit is contained in:
parent
a9d702777e
commit
07151479f0
@ -117,13 +117,21 @@ void XMLCALL EpubHtmlParserSlim::characterData(void* userData, const XML_Char* s
|
||||
return;
|
||||
}
|
||||
|
||||
EpdFontStyle fontStyle = REGULAR;
|
||||
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
|
||||
fontStyle = BOLD_ITALIC;
|
||||
} else if (self->boldUntilDepth < self->depth) {
|
||||
fontStyle = BOLD;
|
||||
} else if (self->italicUntilDepth < self->depth) {
|
||||
fontStyle = ITALIC;
|
||||
}
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (isWhitespace(s[i])) {
|
||||
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
|
||||
if (self->partWordBufferIndex > 0) {
|
||||
self->partWordBuffer[self->partWordBufferIndex] = '\0';
|
||||
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)),
|
||||
self->boldUntilDepth < self->depth, self->italicUntilDepth < self->depth);
|
||||
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
|
||||
self->partWordBufferIndex = 0;
|
||||
}
|
||||
// Skip the whitespace char
|
||||
@ -133,8 +141,7 @@ void XMLCALL EpubHtmlParserSlim::characterData(void* userData, const XML_Char* s
|
||||
// If we're about to run out of space, then cut the word off and start a new one
|
||||
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
|
||||
self->partWordBuffer[self->partWordBufferIndex] = '\0';
|
||||
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)),
|
||||
self->boldUntilDepth < self->depth, self->italicUntilDepth < self->depth);
|
||||
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
|
||||
self->partWordBufferIndex = 0;
|
||||
}
|
||||
|
||||
@ -156,9 +163,17 @@ void XMLCALL EpubHtmlParserSlim::endElement(void* userData, const XML_Char* name
|
||||
matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || self->depth == 1;
|
||||
|
||||
if (shouldBreakText) {
|
||||
EpdFontStyle fontStyle = REGULAR;
|
||||
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
|
||||
fontStyle = BOLD_ITALIC;
|
||||
} else if (self->boldUntilDepth < self->depth) {
|
||||
fontStyle = BOLD;
|
||||
} else if (self->italicUntilDepth < self->depth) {
|
||||
fontStyle = ITALIC;
|
||||
}
|
||||
|
||||
self->partWordBuffer[self->partWordBufferIndex] = '\0';
|
||||
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)),
|
||||
self->boldUntilDepth < self->depth, self->italicUntilDepth < self->depth);
|
||||
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
|
||||
self->partWordBufferIndex = 0;
|
||||
}
|
||||
}
|
||||
@ -263,7 +278,7 @@ void EpubHtmlParserSlim::makePages() {
|
||||
// Long running task, make sure to let other things happen
|
||||
vTaskDelay(1);
|
||||
|
||||
const auto lines = currentTextBlock->splitIntoLines(renderer, fontId, marginLeft + marginRight);
|
||||
const auto lines = currentTextBlock->layoutAndExtractLines(renderer, fontId, marginLeft + marginRight);
|
||||
|
||||
for (auto&& line : lines) {
|
||||
if (currentPageNextY + lineHeight > pageHeight) {
|
||||
|
||||
@ -1,144 +1,137 @@
|
||||
#include "ParsedText.h"
|
||||
|
||||
#include <GfxRenderer.h>
|
||||
#include <Serialization.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
void ParsedText::addWord(std::string word, const bool is_bold, const bool is_italic) {
|
||||
if (word.length() == 0) return;
|
||||
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
||||
|
||||
void ParsedText::addWord(std::string word, const EpdFontStyle fontStyle) {
|
||||
if (word.empty()) return;
|
||||
|
||||
words.push_back(std::move(word));
|
||||
wordStyles.push_back((is_bold ? TextBlock::BOLD_SPAN : 0) | (is_italic ? TextBlock::ITALIC_SPAN : 0));
|
||||
wordStyles.push_back(fontStyle);
|
||||
}
|
||||
|
||||
// Consumes data
|
||||
std::list<std::shared_ptr<TextBlock>> ParsedText::splitIntoLines(const GfxRenderer& renderer, const int fontId,
|
||||
const int horizontalMargin) {
|
||||
const int totalWordCount = words.size();
|
||||
const int pageWidth = GfxRenderer::getScreenWidth() - horizontalMargin;
|
||||
// Consumes data to minimize memory usage
|
||||
std::list<std::shared_ptr<TextBlock>> ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fontId,
|
||||
const int horizontalMargin) {
|
||||
if (words.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
const size_t totalWordCount = words.size();
|
||||
const int pageWidth = renderer.getScreenWidth() - horizontalMargin;
|
||||
const int spaceWidth = renderer.getSpaceWidth(fontId);
|
||||
|
||||
// measure each word
|
||||
std::vector<uint16_t> wordWidths;
|
||||
{
|
||||
auto wordsIt = words.begin();
|
||||
auto wordStylesIt = wordStyles.begin();
|
||||
while (wordsIt != words.end() && wordStylesIt != wordStyles.end()) {
|
||||
// measure the word
|
||||
EpdFontStyle fontStyle = REGULAR;
|
||||
if (*wordStylesIt & TextBlock::BOLD_SPAN) {
|
||||
if (*wordStylesIt & TextBlock::ITALIC_SPAN) {
|
||||
fontStyle = BOLD_ITALIC;
|
||||
} else {
|
||||
fontStyle = BOLD;
|
||||
}
|
||||
} else if (*wordStylesIt & TextBlock::ITALIC_SPAN) {
|
||||
fontStyle = ITALIC;
|
||||
}
|
||||
const int width = renderer.getTextWidth(fontId, wordsIt->c_str(), fontStyle);
|
||||
wordWidths.push_back(width);
|
||||
std::advance(wordsIt, 1);
|
||||
std::advance(wordStylesIt, 1);
|
||||
}
|
||||
wordWidths.reserve(totalWordCount);
|
||||
|
||||
auto wordsIt = words.begin();
|
||||
auto wordStylesIt = wordStyles.begin();
|
||||
|
||||
while (wordsIt != words.end()) {
|
||||
wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
|
||||
|
||||
std::advance(wordsIt, 1);
|
||||
std::advance(wordStylesIt, 1);
|
||||
}
|
||||
|
||||
// Array in which ans[i] store index of last word in line starting with word
|
||||
// word[i]
|
||||
size_t ans[totalWordCount];
|
||||
{
|
||||
// now apply the dynamic programming algorithm to find the best line breaks
|
||||
// DP table in which dp[i] represents cost of line starting with word words[i]
|
||||
int dp[totalWordCount];
|
||||
// DP table to store the minimum badness (cost) of lines starting at index i
|
||||
std::vector<int> dp(totalWordCount);
|
||||
// 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
|
||||
std::vector<size_t> ans(totalWordCount);
|
||||
|
||||
// If only one word is present then only one line is required. Cost of last
|
||||
// line is zero. Hence cost of this line is zero. Ending point is also n-1 as
|
||||
// single word is present
|
||||
dp[totalWordCount - 1] = 0;
|
||||
ans[totalWordCount - 1] = totalWordCount - 1;
|
||||
// Base Case
|
||||
dp[totalWordCount - 1] = 0;
|
||||
ans[totalWordCount - 1] = totalWordCount - 1;
|
||||
|
||||
// Make each word first word of line by iterating over each index in arr.
|
||||
for (int i = totalWordCount - 2; i >= 0; i--) {
|
||||
int currlen = -1;
|
||||
dp[i] = INT_MAX;
|
||||
for (int i = totalWordCount - 2; i >= 0; --i) {
|
||||
int currlen = -spaceWidth;
|
||||
dp[i] = MAX_COST;
|
||||
|
||||
for (size_t j = i; j < totalWordCount; ++j) {
|
||||
// Current line length: previous width + space + current word width
|
||||
currlen += wordWidths[j] + spaceWidth;
|
||||
|
||||
if (currlen > pageWidth) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Variable to store possible minimum cost of line.
|
||||
int cost;
|
||||
if (j == totalWordCount - 1) {
|
||||
cost = 0; // Last line
|
||||
} else {
|
||||
const int remainingSpace = pageWidth - currlen;
|
||||
// Use long long for the square to prevent overflow
|
||||
const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
|
||||
|
||||
// Keep on adding words in current line by iterating from starting word upto
|
||||
// last word in arr.
|
||||
for (int j = i; j < totalWordCount; j++) {
|
||||
// Update the width of the words in current line + the space between two
|
||||
// words.
|
||||
currlen += wordWidths[j] + spaceWidth;
|
||||
|
||||
// If we're bigger than the current pagewidth then we can't add more words
|
||||
if (currlen > pageWidth) break;
|
||||
|
||||
// if we've run out of words then this is last line and the cost should be
|
||||
// 0 Otherwise the cost is the sqaure of the left over space + the costs
|
||||
// of all the previous lines
|
||||
if (j == totalWordCount - 1)
|
||||
cost = 0;
|
||||
else
|
||||
cost = (pageWidth - currlen) * (pageWidth - currlen) + dp[j + 1];
|
||||
|
||||
// Check if this arrangement gives minimum cost for line starting with
|
||||
// word words[i].
|
||||
if (cost < dp[i]) {
|
||||
dp[i] = cost;
|
||||
ans[i] = j;
|
||||
if (cost_ll > MAX_COST) {
|
||||
cost = MAX_COST;
|
||||
} else {
|
||||
cost = static_cast<int>(cost_ll);
|
||||
}
|
||||
}
|
||||
|
||||
if (cost < dp[i]) {
|
||||
dp[i] = cost;
|
||||
ans[i] = j; // j is the index of the last word in this optimal line
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We can now iterate through the answer to find the line break positions
|
||||
std::list<uint16_t> lineBreaks;
|
||||
for (size_t i = 0; i < totalWordCount;) {
|
||||
i = ans[i] + 1;
|
||||
if (i > totalWordCount) {
|
||||
break;
|
||||
}
|
||||
lineBreaks.push_back(i);
|
||||
// Text too big, just exit
|
||||
if (lineBreaks.size() > 1000) {
|
||||
// Stores the index of the word that starts the next line (last_word_index + 1)
|
||||
std::vector<size_t> lineBreakIndices;
|
||||
size_t currentWordIndex = 0;
|
||||
constexpr size_t MAX_LINES = 1000;
|
||||
|
||||
while (currentWordIndex < totalWordCount) {
|
||||
if (lineBreakIndices.size() >= MAX_LINES) {
|
||||
break;
|
||||
}
|
||||
|
||||
size_t nextBreakIndex = ans[currentWordIndex] + 1;
|
||||
lineBreakIndices.push_back(nextBreakIndex);
|
||||
|
||||
currentWordIndex = nextBreakIndex;
|
||||
}
|
||||
|
||||
std::list<std::shared_ptr<TextBlock>> lines;
|
||||
|
||||
// With the line breaks calculated we can now position the words along the
|
||||
// line
|
||||
// Initialize iterators for consumption
|
||||
auto wordStartIt = words.begin();
|
||||
auto wordStyleStartIt = wordStyles.begin();
|
||||
auto wordWidthStartIt = wordWidths.begin();
|
||||
uint16_t lastBreakAt = 0;
|
||||
for (const auto lineBreak : lineBreaks) {
|
||||
const int lineWordCount = lineBreak - lastBreakAt;
|
||||
size_t wordWidthIndex = 0;
|
||||
|
||||
size_t lastBreakAt = 0;
|
||||
for (const size_t lineBreak : lineBreakIndices) {
|
||||
const size_t lineWordCount = lineBreak - lastBreakAt;
|
||||
|
||||
// Calculate end iterators for the range to splice
|
||||
auto wordEndIt = wordStartIt;
|
||||
auto wordStyleEndIt = wordStyleStartIt;
|
||||
auto wordWidthEndIt = wordWidthStartIt;
|
||||
std::advance(wordEndIt, lineWordCount);
|
||||
std::advance(wordStyleEndIt, lineWordCount);
|
||||
std::advance(wordWidthEndIt, lineWordCount);
|
||||
|
||||
// Calculate total word width for this line
|
||||
int lineWordWidthSum = 0;
|
||||
for (auto it = wordWidthStartIt; it != wordWidthEndIt; std::advance(it, 1)) {
|
||||
lineWordWidthSum += *it;
|
||||
for (size_t i = 0; i < lineWordCount; ++i) {
|
||||
lineWordWidthSum += wordWidths[wordWidthIndex + i];
|
||||
}
|
||||
|
||||
// Calculate spacing between words
|
||||
const uint16_t spareSpace = pageWidth - lineWordWidthSum;
|
||||
uint16_t spacing = spaceWidth;
|
||||
// evenly space words if using justified style, not the last line, and at
|
||||
// least 2 words
|
||||
if (style == TextBlock::JUSTIFIED && lineBreak != lineBreaks.back() && lineWordCount >= 2) {
|
||||
// Calculate spacing
|
||||
const int spareSpace = pageWidth - lineWordWidthSum;
|
||||
int spacing = spaceWidth;
|
||||
const bool isLastLine = lineBreak == totalWordCount;
|
||||
|
||||
if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) {
|
||||
spacing = spareSpace / (lineWordCount - 1);
|
||||
}
|
||||
|
||||
// Calculate initial x position
|
||||
uint16_t xpos = 0;
|
||||
if (style == TextBlock::RIGHT_ALIGN) {
|
||||
xpos = spareSpace - (lineWordCount - 1) * spaceWidth;
|
||||
@ -146,24 +139,27 @@ std::list<std::shared_ptr<TextBlock>> ParsedText::splitIntoLines(const GfxRender
|
||||
xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2;
|
||||
}
|
||||
|
||||
// Pre-calculate X positions for words
|
||||
std::list<uint16_t> lineXPos;
|
||||
|
||||
for (auto it = wordWidthStartIt; it != wordWidthEndIt; std::advance(it, 1)) {
|
||||
for (size_t i = 0; i < lineWordCount; ++i) {
|
||||
const uint16_t currentWordWidth = wordWidths[wordWidthIndex + i];
|
||||
lineXPos.push_back(xpos);
|
||||
xpos += *it + spacing;
|
||||
xpos += currentWordWidth + spacing;
|
||||
}
|
||||
|
||||
// *** CRITICAL STEP: CONSUME DATA USING SPLICE ***
|
||||
std::list<std::string> lineWords;
|
||||
std::list<uint8_t> lineWordStyles;
|
||||
lineWords.splice(lineWords.begin(), words, wordStartIt, wordEndIt);
|
||||
std::list<EpdFontStyle> lineWordStyles;
|
||||
lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyleStartIt, wordStyleEndIt);
|
||||
|
||||
lines.push_back(
|
||||
std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
|
||||
|
||||
// Update pointers/indices for the next line
|
||||
wordStartIt = wordEndIt;
|
||||
wordStyleStartIt = wordStyleEndIt;
|
||||
wordWidthStartIt = wordWidthEndIt;
|
||||
wordWidthIndex += lineWordCount;
|
||||
lastBreakAt = lineBreak;
|
||||
}
|
||||
|
||||
|
||||
@ -1,24 +1,29 @@
|
||||
#pragma once
|
||||
|
||||
#include <EpdFontFamily.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "blocks/TextBlock.h"
|
||||
|
||||
class GfxRenderer;
|
||||
|
||||
class ParsedText {
|
||||
std::list<std::string> words;
|
||||
std::list<uint8_t> wordStyles;
|
||||
|
||||
// the style of the block - left, center, right aligned
|
||||
std::list<EpdFontStyle> wordStyles;
|
||||
TextBlock::BLOCK_STYLE style;
|
||||
|
||||
public:
|
||||
explicit ParsedText(const TextBlock::BLOCK_STYLE style) : style(style) {}
|
||||
explicit ParsedText(std::list<std::string> words, std::list<uint8_t> word_styles, const TextBlock::BLOCK_STYLE style)
|
||||
: words(std::move(words)), wordStyles(std::move(word_styles)), style(style) {}
|
||||
~ParsedText() = default;
|
||||
void addWord(std::string word, bool is_bold, bool is_italic);
|
||||
|
||||
void addWord(std::string word, EpdFontStyle fontStyle);
|
||||
void setStyle(const TextBlock::BLOCK_STYLE style) { this->style = style; }
|
||||
TextBlock::BLOCK_STYLE getStyle() const { return style; }
|
||||
bool isEmpty() const { return words.empty(); }
|
||||
std::list<std::shared_ptr<TextBlock>> splitIntoLines(const GfxRenderer& renderer, int fontId, int horizontalMargin);
|
||||
std::list<std::shared_ptr<TextBlock>> layoutAndExtractLines(const GfxRenderer& renderer, int fontId,
|
||||
int horizontalMargin);
|
||||
};
|
||||
|
||||
@ -3,29 +3,13 @@
|
||||
#include <GfxRenderer.h>
|
||||
#include <Serialization.h>
|
||||
|
||||
void TextBlock::addWord(std::string word, const bool is_bold, const bool is_italic) {
|
||||
if (word.length() == 0) return;
|
||||
|
||||
words.push_back(std::move(word));
|
||||
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
|
||||
}
|
||||
|
||||
void TextBlock::render(const GfxRenderer& renderer, const int fontId, const int x, const int y) const {
|
||||
auto wordIt = words.begin();
|
||||
auto wordStylesIt = wordStyles.begin();
|
||||
auto wordXposIt = wordXpos.begin();
|
||||
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
// render the word
|
||||
EpdFontStyle fontStyle = REGULAR;
|
||||
if (*wordStylesIt & BOLD_SPAN && *wordStylesIt & ITALIC_SPAN) {
|
||||
fontStyle = BOLD_ITALIC;
|
||||
} else if (*wordStylesIt & BOLD_SPAN) {
|
||||
fontStyle = BOLD;
|
||||
} else if (*wordStylesIt & ITALIC_SPAN) {
|
||||
fontStyle = ITALIC;
|
||||
}
|
||||
renderer.drawText(fontId, *wordXposIt + x, y, wordIt->c_str(), true, fontStyle);
|
||||
renderer.drawText(fontId, *wordXposIt + x, y, wordIt->c_str(), true, *wordStylesIt);
|
||||
|
||||
std::advance(wordIt, 1);
|
||||
std::advance(wordStylesIt, 1);
|
||||
@ -57,7 +41,7 @@ std::unique_ptr<TextBlock> TextBlock::deserialize(std::istream& is) {
|
||||
uint32_t wc, xc, sc;
|
||||
std::list<std::string> words;
|
||||
std::list<uint16_t> wordXpos;
|
||||
std::list<uint8_t> wordStyles;
|
||||
std::list<EpdFontStyle> wordStyles;
|
||||
BLOCK_STYLE style;
|
||||
|
||||
// words
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
#pragma once
|
||||
#include <EpdFontFamily.h>
|
||||
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
@ -8,11 +10,6 @@
|
||||
// represents a block of words in the html document
|
||||
class TextBlock final : public Block {
|
||||
public:
|
||||
enum SPAN_STYLE : uint8_t {
|
||||
BOLD_SPAN = 1,
|
||||
ITALIC_SPAN = 2,
|
||||
};
|
||||
|
||||
enum BLOCK_STYLE : uint8_t {
|
||||
JUSTIFIED = 0,
|
||||
LEFT_ALIGN = 1,
|
||||
@ -23,19 +20,14 @@ class TextBlock final : public Block {
|
||||
private:
|
||||
std::list<std::string> words;
|
||||
std::list<uint16_t> wordXpos;
|
||||
std::list<uint8_t> wordStyles;
|
||||
|
||||
// the style of the block - left, center, right aligned
|
||||
std::list<EpdFontStyle> wordStyles;
|
||||
BLOCK_STYLE style;
|
||||
|
||||
public:
|
||||
explicit TextBlock(const BLOCK_STYLE style) : style(style) {}
|
||||
explicit TextBlock(std::list<std::string> words, std::list<uint16_t> word_xpos,
|
||||
// the styles of each word
|
||||
std::list<uint8_t> word_styles, const BLOCK_STYLE style)
|
||||
explicit TextBlock(std::list<std::string> words, std::list<uint16_t> word_xpos, std::list<EpdFontStyle> word_styles,
|
||||
const BLOCK_STYLE style)
|
||||
: words(std::move(words)), wordXpos(std::move(word_xpos)), wordStyles(std::move(word_styles)), style(style) {}
|
||||
~TextBlock() override = default;
|
||||
void addWord(std::string word, bool is_bold, bool is_italic);
|
||||
void setStyle(const BLOCK_STYLE style) { this->style = style; }
|
||||
BLOCK_STYLE getStyle() const { return style; }
|
||||
bool isEmpty() override { return words.empty(); }
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <EInkDisplay.h>
|
||||
#include <EpdFontFamily.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "EpdFontFamily.h"
|
||||
|
||||
class GfxRenderer {
|
||||
public:
|
||||
enum FontRenderMode { BW, GRAYSCALE_LSB, GRAYSCALE_MSB };
|
||||
|
||||
Loading…
Reference in New Issue
Block a user