diff --git a/lib/Epub/Epub.cpp b/lib/Epub/Epub.cpp index 234344d7..5ac44652 100644 --- a/lib/Epub/Epub.cpp +++ b/lib/Epub/Epub.cpp @@ -74,9 +74,16 @@ bool Epub::parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata) { // Grab data from opfParser into epub bookMetadata.title = opfParser.title; bookMetadata.author = opfParser.author; + bookMetadata.language = opfParser.language; bookMetadata.coverItemHref = opfParser.coverItemHref; bookMetadata.textReferenceHref = opfParser.textReferenceHref; + if (!bookMetadata.language.empty()) { + Serial.printf("[%lu] [EBP] OPF language: %s\n", millis(), bookMetadata.language.c_str()); + } else { + Serial.printf("[%lu] [EBP] OPF language: \n", millis()); + } + if (!opfParser.tocNcxPath.empty()) { tocNcxItem = opfParser.tocNcxPath; } @@ -345,6 +352,15 @@ const std::string& Epub::getAuthor() const { return bookMetadataCache->coreMetadata.author; } +const std::string& Epub::getLanguage() const { + static std::string blank; + if (!bookMetadataCache || !bookMetadataCache->isLoaded()) { + return blank; + } + + return bookMetadataCache->coreMetadata.language; +} + std::string Epub::getCoverBmpPath() const { return cachePath + "/cover.bmp"; } bool Epub::generateCoverBmp() const { diff --git a/lib/Epub/Epub.h b/lib/Epub/Epub.h index a6555e7e..8d62bbbe 100644 --- a/lib/Epub/Epub.h +++ b/lib/Epub/Epub.h @@ -44,6 +44,7 @@ class Epub { const std::string& getPath() const; const std::string& getTitle() const; const std::string& getAuthor() const; + const std::string& getLanguage() const; std::string getCoverBmpPath() const; bool generateCoverBmp() const; uint8_t* readItemContentsToBytes(const std::string& itemHref, size_t* size = nullptr, diff --git a/lib/Epub/Epub/BookMetadataCache.cpp b/lib/Epub/Epub/BookMetadataCache.cpp index 06b4f458..0ef363ec 100644 --- a/lib/Epub/Epub/BookMetadataCache.cpp +++ b/lib/Epub/Epub/BookMetadataCache.cpp @@ -9,7 +9,7 @@ #include "FsHelpers.h" namespace { -constexpr uint8_t BOOK_CACHE_VERSION = 3; +constexpr uint8_t BOOK_CACHE_VERSION = 4; constexpr char bookBinFile[] = "/book.bin"; constexpr char tmpSpineBinFile[] = "/spine.bin.tmp"; constexpr char tmpTocBinFile[] = "/toc.bin.tmp"; @@ -87,8 +87,9 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta constexpr uint32_t headerASize = sizeof(BOOK_CACHE_VERSION) + /* LUT Offset */ sizeof(uint32_t) + sizeof(spineCount) + sizeof(tocCount); - const uint32_t metadataSize = metadata.title.size() + metadata.author.size() + metadata.coverItemHref.size() + - metadata.textReferenceHref.size() + sizeof(uint32_t) * 4; + const uint32_t metadataSize = metadata.title.size() + metadata.author.size() + metadata.language.size() + + metadata.coverItemHref.size() + metadata.textReferenceHref.size() + + sizeof(uint32_t) * 5; const uint32_t lutSize = sizeof(uint32_t) * spineCount + sizeof(uint32_t) * tocCount; const uint32_t lutOffset = headerASize + metadataSize; @@ -100,6 +101,7 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta // Metadata serialization::writeString(bookFile, metadata.title); serialization::writeString(bookFile, metadata.author); + serialization::writeString(bookFile, metadata.language); serialization::writeString(bookFile, metadata.coverItemHref); serialization::writeString(bookFile, metadata.textReferenceHref); @@ -289,6 +291,7 @@ bool BookMetadataCache::load() { serialization::readString(bookFile, coreMetadata.title); serialization::readString(bookFile, coreMetadata.author); + serialization::readString(bookFile, coreMetadata.language); serialization::readString(bookFile, coreMetadata.coverItemHref); serialization::readString(bookFile, coreMetadata.textReferenceHref); diff --git a/lib/Epub/Epub/BookMetadataCache.h b/lib/Epub/Epub/BookMetadataCache.h index 5f1862c5..29b2ae4a 100644 --- a/lib/Epub/Epub/BookMetadataCache.h +++ b/lib/Epub/Epub/BookMetadataCache.h @@ -9,6 +9,7 @@ class BookMetadataCache { struct BookMetadata { std::string title; std::string author; + std::string language; std::string coverItemHref; std::string textReferenceHref; }; diff --git a/lib/Epub/Epub/Section.cpp b/lib/Epub/Epub/Section.cpp index 15594bdd..c798eaf0 100644 --- a/lib/Epub/Epub/Section.cpp +++ b/lib/Epub/Epub/Section.cpp @@ -5,6 +5,7 @@ #include "Page.h" #include "parsers/ChapterHtmlSlimParser.h" +#include "hyphenation/Hyphenator.h" namespace { constexpr uint8_t SECTION_FILE_VERSION = 9; @@ -186,6 +187,8 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c viewportHeight, hyphenationEnabled, [this, &lut](std::unique_ptr page) { lut.emplace_back(this->onPageComplete(std::move(page))); }, progressFn); + Hyphenator::setPreferredLanguage(epub->getLanguage()); + Serial.printf("[%lu] [SCT] Hyphenation language set to: %s\n", millis(), epub->getLanguage().c_str()); success = visitor.parseAndBuildPages(); SdMan.remove(tmpHtmlPath.c_str()); diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp index 999cb53c..09393bbd 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp @@ -143,22 +143,3 @@ bool hasOnlyAlphabetic(const std::vector& cps) { return true; } -Script detectScript(const std::vector& cps) { - bool hasLatin = false; - bool hasCyrillic = false; - for (const auto& info : cps) { - if (isLatinLetter(info.value)) { - hasLatin = true; - } else if (isCyrillicLetter(info.value)) { - hasCyrillic = true; - } - } - - if (hasLatin && !hasCyrillic) { - return Script::Latin; - } - if (!hasLatin && hasCyrillic) { - return Script::Cyrillic; - } - return Script::Mixed; -} diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h index c28acfa7..752b2b9e 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h @@ -35,4 +35,3 @@ bool isSoftHyphen(uint32_t cp); void trimSurroundingPunctuation(std::vector& cps); bool hasOnlyAlphabetic(const std::vector& cps); -Script detectScript(const std::vector& cps); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index a06bb283..9bac28b7 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -32,6 +32,37 @@ const LanguageHyphenator* hyphenatorForScript(const Script script) { return nullptr; } +// Maps a BCP-47 language tag to a language-specific hyphenator. +const LanguageHyphenator* hyphenatorForLanguage(const std::string& langTag) { + if (langTag.empty()) return nullptr; + + // Extract primary subtag and normalize to lowercase (e.g., "en-US" -> "en"). + std::string primary; + primary.reserve(langTag.size()); + for (char c : langTag) { + if (c == '-' || c == '_') break; + if (c >= 'A' && c <= 'Z') c = static_cast(c - 'A' + 'a'); + primary.push_back(c); + } + if (primary.empty()) return nullptr; + + if (primary == "en") return &EnglishHyphenator::instance(); + if (primary == "ru") return &RussianHyphenator::instance(); + return nullptr; +} + +// Preferred language hint; empty means "auto". +std::string& preferredLanguage() { + static std::string lang; + return lang; +} + +// Cached hyphenator instance for the current preferred language. +const LanguageHyphenator*& cachedHyphenator() { + static const LanguageHyphenator* hyphenator = nullptr; + return hyphenator; +} + // Converts the UTF-8 word into codepoint metadata for downstream rules. std::vector collectCodepoints(const std::string& word) { std::vector cps; @@ -78,8 +109,8 @@ std::vector collectBreakIndexes(const std::vector& cps) { return {}; } - const Script script = detectScript(cps); - if (const auto* hyphenator = hyphenatorForScript(script)) { + // Use cached hyphenator to avoid repeated language lookups. + if (const auto* hyphenator = cachedHyphenator()) { auto indexes = hyphenator->breakIndexes(cps); return indexes; } @@ -95,6 +126,7 @@ size_t byteOffsetForIndex(const std::vector& cps, const size_t in return cps[index].byteOffset; } +// Builds a vector of break information from explicit hyphen markers in the given codepoints. std::vector buildExplicitBreakInfos(const std::vector& cps) { std::vector breaks; breaks.reserve(cps.size()); @@ -182,3 +214,8 @@ std::vector Hyphenator::breakOffsets(const std::string& w return breaks; } + +void Hyphenator::setPreferredLanguage(const std::string& lang) { + preferredLanguage() = lang; + cachedHyphenator() = hyphenatorForLanguage(lang); +} diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.h b/lib/Epub/Epub/hyphenation/Hyphenator.h index 3d1ed040..992d1dd5 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.h +++ b/lib/Epub/Epub/hyphenation/Hyphenator.h @@ -13,4 +13,7 @@ class Hyphenator { // Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the // minimum prefix/suffix constraints are returned even if no language-specific rule matches. static std::vector breakOffsets(const std::string& word, bool includeFallback); + + // Provide a publication-level language hint (e.g. "en", "en-US", "ru") used to select hyphenation rules. + static void setPreferredLanguage(const std::string& lang); }; \ No newline at end of file diff --git a/lib/Epub/Epub/parsers/ContentOpfParser.cpp b/lib/Epub/Epub/parsers/ContentOpfParser.cpp index 2c90d01d..c02d4ee8 100644 --- a/lib/Epub/Epub/parsers/ContentOpfParser.cpp +++ b/lib/Epub/Epub/parsers/ContentOpfParser.cpp @@ -107,6 +107,11 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name return; } + if (self->state == IN_METADATA && strcmp(name, "dc:language") == 0) { + self->state = IN_BOOK_LANGUAGE; + return; + } + if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) { self->state = IN_MANIFEST; if (!SdMan.openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) { @@ -266,6 +271,11 @@ void XMLCALL ContentOpfParser::characterData(void* userData, const XML_Char* s, self->author.append(s, len); return; } + + if (self->state == IN_BOOK_LANGUAGE) { + self->language.append(s, len); + return; + } } void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) { @@ -300,6 +310,11 @@ void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) return; } + if (self->state == IN_BOOK_LANGUAGE && strcmp(name, "dc:language") == 0) { + self->state = IN_METADATA; + return; + } + if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) { self->state = IN_PACKAGE; return; diff --git a/lib/Epub/Epub/parsers/ContentOpfParser.h b/lib/Epub/Epub/parsers/ContentOpfParser.h index 1940aaaf..8c56a86f 100644 --- a/lib/Epub/Epub/parsers/ContentOpfParser.h +++ b/lib/Epub/Epub/parsers/ContentOpfParser.h @@ -13,6 +13,7 @@ class ContentOpfParser final : public Print { IN_METADATA, IN_BOOK_TITLE, IN_BOOK_AUTHOR, + IN_BOOK_LANGUAGE, IN_MANIFEST, IN_SPINE, IN_GUIDE, @@ -34,6 +35,7 @@ class ContentOpfParser final : public Print { public: std::string title; std::string author; + std::string language; std::string tocNcxPath; std::string tocNavPath; // EPUB 3 nav document path std::string coverItemHref;