mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-06 15:47:39 +03:00
Add language support to Epub metadata and hyphenation logic
This commit is contained in:
parent
4f94cf2c36
commit
61d0e1cadf
@ -74,9 +74,16 @@ bool Epub::parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata) {
|
|||||||
// Grab data from opfParser into epub
|
// Grab data from opfParser into epub
|
||||||
bookMetadata.title = opfParser.title;
|
bookMetadata.title = opfParser.title;
|
||||||
bookMetadata.author = opfParser.author;
|
bookMetadata.author = opfParser.author;
|
||||||
|
bookMetadata.language = opfParser.language;
|
||||||
bookMetadata.coverItemHref = opfParser.coverItemHref;
|
bookMetadata.coverItemHref = opfParser.coverItemHref;
|
||||||
bookMetadata.textReferenceHref = opfParser.textReferenceHref;
|
bookMetadata.textReferenceHref = opfParser.textReferenceHref;
|
||||||
|
|
||||||
|
if (!bookMetadata.language.empty()) {
|
||||||
|
Serial.printf("[%lu] [EBP] OPF language: %s\n", millis(), bookMetadata.language.c_str());
|
||||||
|
} else {
|
||||||
|
Serial.printf("[%lu] [EBP] OPF language: <none>\n", millis());
|
||||||
|
}
|
||||||
|
|
||||||
if (!opfParser.tocNcxPath.empty()) {
|
if (!opfParser.tocNcxPath.empty()) {
|
||||||
tocNcxItem = opfParser.tocNcxPath;
|
tocNcxItem = opfParser.tocNcxPath;
|
||||||
}
|
}
|
||||||
@ -345,6 +352,15 @@ const std::string& Epub::getAuthor() const {
|
|||||||
return bookMetadataCache->coreMetadata.author;
|
return bookMetadataCache->coreMetadata.author;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const std::string& Epub::getLanguage() const {
|
||||||
|
static std::string blank;
|
||||||
|
if (!bookMetadataCache || !bookMetadataCache->isLoaded()) {
|
||||||
|
return blank;
|
||||||
|
}
|
||||||
|
|
||||||
|
return bookMetadataCache->coreMetadata.language;
|
||||||
|
}
|
||||||
|
|
||||||
std::string Epub::getCoverBmpPath() const { return cachePath + "/cover.bmp"; }
|
std::string Epub::getCoverBmpPath() const { return cachePath + "/cover.bmp"; }
|
||||||
|
|
||||||
bool Epub::generateCoverBmp() const {
|
bool Epub::generateCoverBmp() const {
|
||||||
|
|||||||
@ -44,6 +44,7 @@ class Epub {
|
|||||||
const std::string& getPath() const;
|
const std::string& getPath() const;
|
||||||
const std::string& getTitle() const;
|
const std::string& getTitle() const;
|
||||||
const std::string& getAuthor() const;
|
const std::string& getAuthor() const;
|
||||||
|
const std::string& getLanguage() const;
|
||||||
std::string getCoverBmpPath() const;
|
std::string getCoverBmpPath() const;
|
||||||
bool generateCoverBmp() const;
|
bool generateCoverBmp() const;
|
||||||
uint8_t* readItemContentsToBytes(const std::string& itemHref, size_t* size = nullptr,
|
uint8_t* readItemContentsToBytes(const std::string& itemHref, size_t* size = nullptr,
|
||||||
|
|||||||
@ -9,7 +9,7 @@
|
|||||||
#include "FsHelpers.h"
|
#include "FsHelpers.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
constexpr uint8_t BOOK_CACHE_VERSION = 3;
|
constexpr uint8_t BOOK_CACHE_VERSION = 4;
|
||||||
constexpr char bookBinFile[] = "/book.bin";
|
constexpr char bookBinFile[] = "/book.bin";
|
||||||
constexpr char tmpSpineBinFile[] = "/spine.bin.tmp";
|
constexpr char tmpSpineBinFile[] = "/spine.bin.tmp";
|
||||||
constexpr char tmpTocBinFile[] = "/toc.bin.tmp";
|
constexpr char tmpTocBinFile[] = "/toc.bin.tmp";
|
||||||
@ -87,8 +87,9 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
|
|||||||
|
|
||||||
constexpr uint32_t headerASize =
|
constexpr uint32_t headerASize =
|
||||||
sizeof(BOOK_CACHE_VERSION) + /* LUT Offset */ sizeof(uint32_t) + sizeof(spineCount) + sizeof(tocCount);
|
sizeof(BOOK_CACHE_VERSION) + /* LUT Offset */ sizeof(uint32_t) + sizeof(spineCount) + sizeof(tocCount);
|
||||||
const uint32_t metadataSize = metadata.title.size() + metadata.author.size() + metadata.coverItemHref.size() +
|
const uint32_t metadataSize = metadata.title.size() + metadata.author.size() + metadata.language.size() +
|
||||||
metadata.textReferenceHref.size() + sizeof(uint32_t) * 4;
|
metadata.coverItemHref.size() + metadata.textReferenceHref.size() +
|
||||||
|
sizeof(uint32_t) * 5;
|
||||||
const uint32_t lutSize = sizeof(uint32_t) * spineCount + sizeof(uint32_t) * tocCount;
|
const uint32_t lutSize = sizeof(uint32_t) * spineCount + sizeof(uint32_t) * tocCount;
|
||||||
const uint32_t lutOffset = headerASize + metadataSize;
|
const uint32_t lutOffset = headerASize + metadataSize;
|
||||||
|
|
||||||
@ -100,6 +101,7 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
|
|||||||
// Metadata
|
// Metadata
|
||||||
serialization::writeString(bookFile, metadata.title);
|
serialization::writeString(bookFile, metadata.title);
|
||||||
serialization::writeString(bookFile, metadata.author);
|
serialization::writeString(bookFile, metadata.author);
|
||||||
|
serialization::writeString(bookFile, metadata.language);
|
||||||
serialization::writeString(bookFile, metadata.coverItemHref);
|
serialization::writeString(bookFile, metadata.coverItemHref);
|
||||||
serialization::writeString(bookFile, metadata.textReferenceHref);
|
serialization::writeString(bookFile, metadata.textReferenceHref);
|
||||||
|
|
||||||
@ -289,6 +291,7 @@ bool BookMetadataCache::load() {
|
|||||||
|
|
||||||
serialization::readString(bookFile, coreMetadata.title);
|
serialization::readString(bookFile, coreMetadata.title);
|
||||||
serialization::readString(bookFile, coreMetadata.author);
|
serialization::readString(bookFile, coreMetadata.author);
|
||||||
|
serialization::readString(bookFile, coreMetadata.language);
|
||||||
serialization::readString(bookFile, coreMetadata.coverItemHref);
|
serialization::readString(bookFile, coreMetadata.coverItemHref);
|
||||||
serialization::readString(bookFile, coreMetadata.textReferenceHref);
|
serialization::readString(bookFile, coreMetadata.textReferenceHref);
|
||||||
|
|
||||||
|
|||||||
@ -9,6 +9,7 @@ class BookMetadataCache {
|
|||||||
// Core publication metadata kept in the book cache.
// All fields are plain strings; an empty string means the value was not provided.
struct BookMetadata {
  std::string title;
  std::string author;
  std::string language;  // publication language tag as read from the OPF (e.g. "en", "en-US")
  std::string coverItemHref;
  std::string textReferenceHref;
};
|
||||||
|
|||||||
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include "Page.h"
|
#include "Page.h"
|
||||||
#include "parsers/ChapterHtmlSlimParser.h"
|
#include "parsers/ChapterHtmlSlimParser.h"
|
||||||
|
#include "hyphenation/Hyphenator.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
constexpr uint8_t SECTION_FILE_VERSION = 9;
|
constexpr uint8_t SECTION_FILE_VERSION = 9;
|
||||||
@ -186,6 +187,8 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
|
|||||||
viewportHeight, hyphenationEnabled,
|
viewportHeight, hyphenationEnabled,
|
||||||
[this, &lut](std::unique_ptr<Page> page) { lut.emplace_back(this->onPageComplete(std::move(page))); },
|
[this, &lut](std::unique_ptr<Page> page) { lut.emplace_back(this->onPageComplete(std::move(page))); },
|
||||||
progressFn);
|
progressFn);
|
||||||
|
Hyphenator::setPreferredLanguage(epub->getLanguage());
|
||||||
|
Serial.printf("[%lu] [SCT] Hyphenation language set to: %s\n", millis(), epub->getLanguage().c_str());
|
||||||
success = visitor.parseAndBuildPages();
|
success = visitor.parseAndBuildPages();
|
||||||
|
|
||||||
SdMan.remove(tmpHtmlPath.c_str());
|
SdMan.remove(tmpHtmlPath.c_str());
|
||||||
|
|||||||
@ -143,22 +143,3 @@ bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Classifies a word by the alphabets of its letters:
//  - Script::Latin    when Latin letters were seen and no Cyrillic ones,
//  - Script::Cyrillic when Cyrillic letters were seen and no Latin ones,
//  - Script::Mixed    otherwise (both kinds present, or neither).
Script detectScript(const std::vector<CodepointInfo>& cps) {
  bool sawLatin = false;
  bool sawCyrillic = false;

  for (const auto& cp : cps) {
    if (isLatinLetter(cp.value)) {
      sawLatin = true;
      continue;
    }
    if (isCyrillicLetter(cp.value)) {
      sawCyrillic = true;
    }
  }

  // Equal flags means "both" or "neither"; either way no single script applies.
  if (sawLatin == sawCyrillic) {
    return Script::Mixed;
  }
  return sawLatin ? Script::Latin : Script::Cyrillic;
}
|
|
||||||
|
|||||||
@ -35,4 +35,3 @@ bool isSoftHyphen(uint32_t cp);
|
|||||||
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
||||||
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps);
|
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps);
|
||||||
|
|
||||||
Script detectScript(const std::vector<CodepointInfo>& cps);
|
|
||||||
|
|||||||
@ -32,6 +32,37 @@ const LanguageHyphenator* hyphenatorForScript(const Script script) {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Maps a BCP-47 language tag to a language-specific hyphenator.
|
||||||
|
const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) {
|
||||||
|
if (langTag.empty()) return nullptr;
|
||||||
|
|
||||||
|
// Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en").
|
||||||
|
std::string primary;
|
||||||
|
primary.reserve(langTag.size());
|
||||||
|
for (char c : langTag) {
|
||||||
|
if (c == '-' || c == '_') break;
|
||||||
|
if (c >= 'A' && c <= 'Z') c = static_cast<char>(c - 'A' + 'a');
|
||||||
|
primary.push_back(c);
|
||||||
|
}
|
||||||
|
if (primary.empty()) return nullptr;
|
||||||
|
|
||||||
|
if (primary == "en") return &EnglishHyphenator::instance();
|
||||||
|
if (primary == "ru") return &RussianHyphenator::instance();
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Storage for the preferred language hint. The string starts out empty,
// which means "auto"; callers read or assign it through the returned reference.
std::string& preferredLanguage() {
  static std::string languageHint;
  return languageHint;
}
|
||||||
|
|
||||||
|
// Cached hyphenator instance for the current preferred language.
|
||||||
|
const LanguageHyphenator*& cachedHyphenator() {
|
||||||
|
static const LanguageHyphenator* hyphenator = nullptr;
|
||||||
|
return hyphenator;
|
||||||
|
}
|
||||||
|
|
||||||
// Converts the UTF-8 word into codepoint metadata for downstream rules.
|
// Converts the UTF-8 word into codepoint metadata for downstream rules.
|
||||||
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
||||||
std::vector<CodepointInfo> cps;
|
std::vector<CodepointInfo> cps;
|
||||||
@ -78,8 +109,8 @@ std::vector<size_t> collectBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
const Script script = detectScript(cps);
|
// Use cached hyphenator to avoid repeated language lookups.
|
||||||
if (const auto* hyphenator = hyphenatorForScript(script)) {
|
if (const auto* hyphenator = cachedHyphenator()) {
|
||||||
auto indexes = hyphenator->breakIndexes(cps);
|
auto indexes = hyphenator->breakIndexes(cps);
|
||||||
return indexes;
|
return indexes;
|
||||||
}
|
}
|
||||||
@ -95,6 +126,7 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
|
|||||||
return cps[index].byteOffset;
|
return cps[index].byteOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds a vector of break information from explicit hyphen markers in the given codepoints.
|
||||||
std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
|
std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
|
||||||
std::vector<Hyphenator::BreakInfo> breaks;
|
std::vector<Hyphenator::BreakInfo> breaks;
|
||||||
breaks.reserve(cps.size());
|
breaks.reserve(cps.size());
|
||||||
@ -182,3 +214,8 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
|||||||
|
|
||||||
return breaks;
|
return breaks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Hyphenator::setPreferredLanguage(const std::string& lang) {
|
||||||
|
preferredLanguage() = lang;
|
||||||
|
cachedHyphenator() = hyphenatorForLanguage(lang);
|
||||||
|
}
|
||||||
|
|||||||
@ -13,4 +13,7 @@ class Hyphenator {
|
|||||||
// Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the
|
// Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the
|
||||||
// minimum prefix/suffix constraints are returned even if no language-specific rule matches.
|
// minimum prefix/suffix constraints are returned even if no language-specific rule matches.
|
||||||
static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);
|
static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);
|
||||||
|
|
||||||
|
// Provide a publication-level language hint (e.g. "en", "en-US", "ru") used to select hyphenation rules.
|
||||||
|
static void setPreferredLanguage(const std::string& lang);
|
||||||
};
|
};
|
||||||
@ -107,6 +107,11 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (self->state == IN_METADATA && strcmp(name, "dc:language") == 0) {
|
||||||
|
self->state = IN_BOOK_LANGUAGE;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
|
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
|
||||||
self->state = IN_MANIFEST;
|
self->state = IN_MANIFEST;
|
||||||
if (!SdMan.openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
|
if (!SdMan.openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
|
||||||
@ -266,6 +271,11 @@ void XMLCALL ContentOpfParser::characterData(void* userData, const XML_Char* s,
|
|||||||
self->author.append(s, len);
|
self->author.append(s, len);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (self->state == IN_BOOK_LANGUAGE) {
|
||||||
|
self->language.append(s, len);
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) {
|
void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) {
|
||||||
@ -300,6 +310,11 @@ void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (self->state == IN_BOOK_LANGUAGE && strcmp(name, "dc:language") == 0) {
|
||||||
|
self->state = IN_METADATA;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
|
if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
|
||||||
self->state = IN_PACKAGE;
|
self->state = IN_PACKAGE;
|
||||||
return;
|
return;
|
||||||
|
|||||||
@ -13,6 +13,7 @@ class ContentOpfParser final : public Print {
|
|||||||
IN_METADATA,
|
IN_METADATA,
|
||||||
IN_BOOK_TITLE,
|
IN_BOOK_TITLE,
|
||||||
IN_BOOK_AUTHOR,
|
IN_BOOK_AUTHOR,
|
||||||
|
IN_BOOK_LANGUAGE,
|
||||||
IN_MANIFEST,
|
IN_MANIFEST,
|
||||||
IN_SPINE,
|
IN_SPINE,
|
||||||
IN_GUIDE,
|
IN_GUIDE,
|
||||||
@ -34,6 +35,7 @@ class ContentOpfParser final : public Print {
|
|||||||
public:
|
public:
|
||||||
std::string title;
|
std::string title;
|
||||||
std::string author;
|
std::string author;
|
||||||
|
std::string language;
|
||||||
std::string tocNcxPath;
|
std::string tocNcxPath;
|
||||||
std::string tocNavPath; // EPUB 3 nav document path
|
std::string tocNavPath; // EPUB 3 nav document path
|
||||||
std::string coverItemHref;
|
std::string coverItemHref;
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user