From 59f717216a20c62dd7081426b9371daff88b86d7 Mon Sep 17 00:00:00 2001 From: Dave Allie Date: Mon, 8 Dec 2025 00:30:26 +1100 Subject: [PATCH] Stream HTML from ZIP down to disk instead of loading all in mem --- lib/Epub/Epub.cpp | 20 +- lib/Epub/Epub.h | 5 +- lib/Epub/Epub/EpubHtmlParserSlim.cpp | 5 + lib/Epub/Epub/Section.cpp | 29 ++- lib/ZipFile/ZipFile.cpp | 285 +++++++++++++++++++++------ lib/ZipFile/ZipFile.h | 9 +- src/screens/EpubReaderScreen.cpp | 1 + 7 files changed, 261 insertions(+), 93 deletions(-) diff --git a/lib/Epub/Epub.cpp b/lib/Epub/Epub.cpp index 83caf1d..3beaa54 100644 --- a/lib/Epub/Epub.cpp +++ b/lib/Epub/Epub.cpp @@ -9,7 +9,7 @@ bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) { // open up the meta data to find where the content.opf file lives size_t s; - const auto metaInfo = zip.readTextFileToMemory("META-INF/container.xml", &s); + const auto metaInfo = reinterpret_cast(zip.readFileToMemory("META-INF/container.xml", &s, true)); if (!metaInfo) { Serial.println("Could not find META-INF/container.xml"); return false; @@ -57,7 +57,7 @@ bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) { bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) { // read in the content.opf file and parse it - auto contents = zip.readTextFileToMemory(content_opf_file.c_str()); + auto contents = reinterpret_cast(zip.readFileToMemory(content_opf_file.c_str(), nullptr, true)); // parse the contents tinyxml2::XMLDocument doc; @@ -168,7 +168,7 @@ bool Epub::parseTocNcxFile(const ZipFile& zip) { return false; } - const auto ncxData = zip.readTextFileToMemory(tocNcxItem.c_str()); + const auto ncxData = reinterpret_cast(zip.readFileToMemory(tocNcxItem.c_str(), nullptr, true)); if (!ncxData) { Serial.printf("Could not find %s\n", tocNcxItem.c_str()); return false; @@ -308,11 +308,11 @@ std::string normalisePath(const std::string& path) { return result; } -uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const { +uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size, bool trailingNullByte) const { const ZipFile zip("/sd" + filepath); const std::string path = normalisePath(itemHref); - const auto content = zip.readFileToMemory(path.c_str(), size); + const auto content = zip.readFileToMemory(path.c_str(), size, trailingNullByte); if (!content) { Serial.printf("Failed to read item %s\n", path.c_str()); return nullptr; @@ -321,17 +321,11 @@ uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const return content; } -char* Epub::getTextItemContents(const std::string& itemHref, size_t* size) const { +bool Epub::readItemContentsToStream(const std::string& itemHref, Print& out, const size_t chunkSize) const { const ZipFile zip("/sd" + filepath); const std::string path = normalisePath(itemHref); - const auto content = zip.readTextFileToMemory(path.c_str(), size); - if (!content) { - Serial.printf("Failed to read item %s\n", path.c_str()); - return nullptr; - } - - return content; + return zip.readFileToStream(path.c_str(), out, chunkSize); } int Epub::getSpineItemsCount() const { return spine.size(); } diff --git a/lib/Epub/Epub.h b/lib/Epub/Epub.h index b59dcfa..0a772b3 100644 --- a/lib/Epub/Epub.h +++ b/lib/Epub/Epub.h @@ -56,8 +56,9 @@ class Epub { const std::string& getPath() const; const std::string& getTitle() const; const std::string& getCoverImageItem() const; - uint8_t* getItemContents(const std::string& itemHref, size_t* size = nullptr) const; - char* getTextItemContents(const std::string& itemHref, size_t* size = nullptr) const; + uint8_t* readItemContentsToBytes(const std::string& itemHref, size_t* size = nullptr, + bool trailingNullByte = false) const; + bool readItemContentsToStream(const std::string& itemHref, Print& out, size_t chunkSize) const; std::string& getSpineItem(int spineIndex); int getSpineItemsCount() const; EpubTocEntry& getTocItem(int tocTndex); diff --git a/lib/Epub/Epub/EpubHtmlParserSlim.cpp b/lib/Epub/Epub/EpubHtmlParserSlim.cpp index cb425ee..50e588d 100644 --- a/lib/Epub/Epub/EpubHtmlParserSlim.cpp +++ b/lib/Epub/Epub/EpubHtmlParserSlim.cpp @@ -199,6 +199,11 @@ bool EpubHtmlParserSlim::parseAndBuildPages() { XML_SetCharacterDataHandler(parser, characterData); FILE* file = fopen(filepath, "r"); + if (!file) { + Serial.printf("Couldn't open file %s\n", filepath); + XML_ParserFree(parser); + return false; + } do { void* const buf = XML_GetBuffer(parser, 1024); diff --git a/lib/Epub/Epub/Section.cpp b/lib/Epub/Epub/Section.cpp index aa9ad95..dbc0457 100644 --- a/lib/Epub/Epub/Section.cpp +++ b/lib/Epub/Epub/Section.cpp @@ -64,35 +64,28 @@ void Section::setupCacheDir() const { void Section::clearCache() const { SD.rmdir(cachePath.c_str()); } bool Section::persistPageDataToSD() { - size_t size = 0; - auto localPath = epub->getSpineItem(spineIndex); + const auto localPath = epub->getSpineItem(spineIndex); - const auto html = epub->getItemContents(epub->getSpineItem(spineIndex), &size); - if (!html) { - Serial.println("Failed to read item contents"); - return false; - } - - // TODO: Would love to stream this through an XML visitor + // TODO: Should we get rid of this file all together? + // It currently saves us a bit of memory by allowing for all the inflation bits to be released + // before loading the XML parser const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html"; - File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE); - const auto written = f.write(html, size); + File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE, true); + bool success = epub->readItemContentsToStream(localPath, f, 1024); f.close(); - free(html); - Serial.printf("Wrote %d bytes to %s\n", written, tmpHtmlPath.c_str()); - - if (size != written) { - Serial.println("Failed to inflate section contents to SD"); - SD.remove(tmpHtmlPath.c_str()); + if (!success) { + Serial.println("Failed to stream item contents"); return false; } + Serial.printf("Streamed HTML to %s\n", tmpHtmlPath.c_str()); + const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath; auto visitor = EpubHtmlParserSlim(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); }); - const bool success = visitor.parseAndBuildPages(); + success = visitor.parseAndBuildPages(); SD.remove(tmpHtmlPath.c_str()); if (!success) { diff --git a/lib/ZipFile/ZipFile.cpp b/lib/ZipFile/ZipFile.cpp index 0834838..a6dbbe8 100644 --- a/lib/ZipFile/ZipFile.cpp +++ b/lib/ZipFile/ZipFile.cpp @@ -3,51 +3,37 @@ #include #include -int libzInflateOneShot(const uint8_t* inputBuff, const size_t compSize, uint8_t* outputBuff, const size_t uncompSize) { - mz_stream pStream = { - .next_in = inputBuff, - .avail_in = compSize, - .total_in = 0, - .next_out = outputBuff, - .avail_out = uncompSize, - .total_out = 0, - }; +bool inflateOneShot(const uint8_t* inputBuf, const size_t deflatedSize, uint8_t* outputBuf, const size_t inflatedSize) { + // Setup inflator + const auto inflator = static_cast(malloc(sizeof(tinfl_decompressor))); + if (!inflator) { + Serial.println("Failed to allocate memory for inflator"); + return false; + } + memset(inflator, 0, sizeof(tinfl_decompressor)); + tinfl_init(inflator); - int status = 0; - status = mz_inflateInit2(&pStream, -MZ_DEFAULT_WINDOW_BITS); + size_t inBytes = deflatedSize; + size_t outBytes = inflatedSize; + const tinfl_status status = tinfl_decompress(inflator, inputBuf, &inBytes, nullptr, outputBuf, &outBytes, + TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF); + free(inflator); - if (status != MZ_OK) { - Serial.printf("inflateInit2 failed: %d\n", status); - return status; + if (status != TINFL_STATUS_DONE) { + Serial.printf("tinfl_decompress() failed with status %d\n", status); + return false; } - status = mz_inflate(&pStream, MZ_FINISH); - if (status != MZ_STREAM_END) { - Serial.printf("inflate failed: %d\n", status); - return status; - } - - status = mz_inflateEnd(&pStream); - if (status != MZ_OK) { - Serial.printf("inflateEnd failed: %d\n", status); - return status; - } - - return status; + return true; } -char* ZipFile::readTextFileToMemory(const char* filename, size_t* size) const { - const auto data = readFileToMemory(filename, size, true); - return data ? reinterpret_cast(data) : nullptr; -} - -uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trailingNullByte) const { +bool ZipFile::loadFileStat(const char* filename, mz_zip_archive_file_stat* fileStat) const { mz_zip_archive zipArchive = {}; const bool status = mz_zip_reader_init_file(&zipArchive, filePath.c_str(), 0); if (!status) { Serial.printf("mz_zip_reader_init_file() failed!\nError %s\n", mz_zip_get_error_string(zipArchive.m_last_error)); - return nullptr; + return false; } // find the file @@ -55,41 +41,57 @@ uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trai if (!mz_zip_reader_locate_file_v2(&zipArchive, filename, nullptr, 0, &fileIndex)) { Serial.printf("Could not find file %s\n", filename); mz_zip_reader_end(&zipArchive); - return nullptr; + return false; } - mz_zip_archive_file_stat fileStat; - if (!mz_zip_reader_file_stat(&zipArchive, fileIndex, &fileStat)) { + if (!mz_zip_reader_file_stat(&zipArchive, fileIndex, fileStat)) { Serial.printf("mz_zip_reader_file_stat() failed!\nError %s\n", mz_zip_get_error_string(zipArchive.m_last_error)); mz_zip_reader_end(&zipArchive); - return nullptr; + return false; } mz_zip_reader_end(&zipArchive); + return true; +} - uint8_t pLocalHeader[30]; - uint64_t fileOffset = fileStat.m_local_header_ofs; +long ZipFile::getDataOffset(const mz_zip_archive_file_stat& fileStat) const { + constexpr auto localHeaderSize = 30; - // Reopen the file to manual read out delated bytes - FILE* file = fopen(filePath.c_str(), "rb"); + uint8_t pLocalHeader[localHeaderSize]; + const uint64_t fileOffset = fileStat.m_local_header_ofs; + + FILE* file = fopen(filePath.c_str(), "r"); fseek(file, fileOffset, SEEK_SET); + const size_t read = fread(pLocalHeader, 1, localHeaderSize, file); + fclose(file); - const size_t read = fread(pLocalHeader, 1, 30, file); - if (read != 30) { + if (read != localHeaderSize) { Serial.println("Something went wrong reading the local header"); - fclose(file); - return nullptr; + return -1; } if (pLocalHeader[0] + (pLocalHeader[1] << 8) + (pLocalHeader[2] << 16) + (pLocalHeader[3] << 24) != 0x04034b50 /* MZ_ZIP_LOCAL_DIR_HEADER_SIG */) { Serial.println("Not a valid zip file header"); - fclose(file); - return nullptr; + return -1; } const uint16_t filenameLength = pLocalHeader[26] + (pLocalHeader[27] << 8); const uint16_t extraOffset = pLocalHeader[28] + (pLocalHeader[29] << 8); - fileOffset += 30 + filenameLength + extraOffset; + return fileOffset + localHeaderSize + filenameLength + extraOffset; +} + +uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, const bool trailingNullByte) const { + mz_zip_archive_file_stat fileStat; + if (!loadFileStat(filename, &fileStat)) { + return nullptr; + } + + const long fileOffset = getDataOffset(fileStat); + if (fileOffset < 0) { + return nullptr; + } + + FILE* file = fopen(filePath.c_str(), "rb"); fseek(file, fileOffset, SEEK_SET); const auto deflatedDataSize = static_cast(fileStat.m_comp_size); @@ -97,15 +99,19 @@ uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trai const auto dataSize = trailingNullByte ? inflatedDataSize + 1 : inflatedDataSize; const auto data = static_cast(malloc(dataSize)); - if (!fileStat.m_method) { + if (fileStat.m_method == MZ_NO_COMPRESSION) { // no deflation, just read content const size_t dataRead = fread(data, 1, inflatedDataSize, file); fclose(file); + if (dataRead != inflatedDataSize) { Serial.println("Failed to read data"); + free(data); return nullptr; } - } else { + + // Continue out of block with data set + } else if (fileStat.m_method == MZ_DEFLATED) { // Read out deflated content from file const auto deflatedData = static_cast(malloc(deflatedDataSize)); if (deflatedData == nullptr) { @@ -116,25 +122,186 @@ uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trai const size_t dataRead = fread(deflatedData, 1, deflatedDataSize, file); fclose(file); + if (dataRead != deflatedDataSize) { Serial.printf("Failed to read data, expected %d got %d\n", deflatedDataSize, dataRead); free(deflatedData); + free(data); return nullptr; } - const int result = libzInflateOneShot(deflatedData, deflatedDataSize, data, inflatedDataSize); + bool success = inflateOneShot(deflatedData, deflatedDataSize, data, inflatedDataSize); free(deflatedData); - if (result != MZ_OK) { + + if (!success) { Serial.println("Failed to inflate file"); + free(data); return nullptr; } + + // Continue out of block with data set + } else { + Serial.println("Unsupported compression method"); + fclose(file); + return nullptr; } - if (trailingNullByte) { - data[inflatedDataSize] = '\0'; - } - if (size) { - *size = inflatedDataSize; - } + if (trailingNullByte) data[inflatedDataSize] = '\0'; + if (size) *size = inflatedDataSize; return data; } + +bool ZipFile::readFileToStream(const char* filename, Print& out, const size_t chunkSize) const { + mz_zip_archive_file_stat fileStat; + if (!loadFileStat(filename, &fileStat)) { + return false; + } + + const long fileOffset = getDataOffset(fileStat); + if (fileOffset < 0) { + return false; + } + + FILE* file = fopen(filePath.c_str(), "rb"); + fseek(file, fileOffset, SEEK_SET); + + const auto deflatedDataSize = static_cast(fileStat.m_comp_size); + const auto inflatedDataSize = static_cast(fileStat.m_uncomp_size); + + if (fileStat.m_method == MZ_NO_COMPRESSION) { + // no deflation, just read content + const auto buffer = static_cast(malloc(chunkSize)); + if (!buffer) { + Serial.println("Failed to allocate memory for buffer"); + fclose(file); + return false; + } + + size_t remaining = inflatedDataSize; + while (remaining > 0) { + const size_t dataRead = fread(buffer, 1, remaining < chunkSize ? remaining : chunkSize, file); + if (dataRead == 0) { + Serial.println("Could not read more bytes"); + free(buffer); + fclose(file); + return false; + } + + out.write(buffer, dataRead); + remaining -= dataRead; + } + + fclose(file); + free(buffer); + return true; + } + + if (fileStat.m_method == MZ_DEFLATED) { + // Setup inflator + const auto inflator = static_cast(malloc(sizeof(tinfl_decompressor))); + if (!inflator) { + Serial.println("Failed to allocate memory for inflator"); + fclose(file); + return false; + } + memset(inflator, 0, sizeof(tinfl_decompressor)); + tinfl_init(inflator); + + // Setup file read buffer + const auto fileReadBuffer = static_cast(malloc(chunkSize)); + if (!fileReadBuffer) { + Serial.println("Failed to allocate memory for zip file read buffer"); + free(inflator); + fclose(file); + return false; + } + + const auto outputBuffer = static_cast(malloc(TINFL_LZ_DICT_SIZE)); + if (!outputBuffer) { + Serial.println("Failed to allocate memory for dictionary"); + free(inflator); + free(fileReadBuffer); + fclose(file); + return false; + } + memset(outputBuffer, 0, TINFL_LZ_DICT_SIZE); + + size_t fileRemainingBytes = deflatedDataSize; + size_t processedOutputBytes = 0; + size_t fileReadBufferFilledBytes = 0; + size_t fileReadBufferCursor = 0; + size_t outputCursor = 0; // Current offset in the circular dictionary + + while (true) { + // Load more compressed bytes when needed + if (fileReadBufferCursor >= fileReadBufferFilledBytes) { + if (fileRemainingBytes == 0) { + // Should not be hit, but a safe protection + break; // EOF + } + + fileReadBufferFilledBytes = + fread(fileReadBuffer, 1, fileRemainingBytes < chunkSize ? fileRemainingBytes : chunkSize, file); + fileRemainingBytes -= fileReadBufferFilledBytes; + fileReadBufferCursor = 0; + + if (fileReadBufferFilledBytes == 0) { + // Bad read + break; // EOF + } + } + + // Available bytes in fileReadBuffer to process + size_t inBytes = fileReadBufferFilledBytes - fileReadBufferCursor; + // Space remaining in outputBuffer + size_t outBytes = TINFL_LZ_DICT_SIZE - outputCursor; + + const tinfl_status status = tinfl_decompress(inflator, fileReadBuffer + fileReadBufferCursor, &inBytes, + outputBuffer, outputBuffer + outputCursor, &outBytes, + fileRemainingBytes > 0 ? TINFL_FLAG_HAS_MORE_INPUT : 0); + + // Update input position + fileReadBufferCursor += inBytes; + + // Write output chunk + if (outBytes > 0) { + processedOutputBytes += outBytes; + out.write(outputBuffer + outputCursor, outBytes); + // Update output position in buffer (with wraparound) + outputCursor = (outputCursor + outBytes) & (TINFL_LZ_DICT_SIZE - 1); + } + + Serial.printf("Decompressing - %d/%d deflated into %d/%d inflated\n", deflatedDataSize - fileRemainingBytes, + deflatedDataSize, processedOutputBytes, inflatedDataSize); + + if (status < 0) { + Serial.printf("tinfl_decompress() failed with status %d\n", status); + fclose(file); + free(outputBuffer); + free(fileReadBuffer); + free(inflator); + return false; + } + + if (status == TINFL_STATUS_DONE) { + Serial.println("Decompression finished"); + fclose(file); + free(inflator); + free(fileReadBuffer); + free(outputBuffer); + return true; + } + } + + // If we get here, EOF reached without TINFL_STATUS_DONE + Serial.println("Unexpected EOF"); + fclose(file); + free(outputBuffer); + free(fileReadBuffer); + free(inflator); + return false; + } + + Serial.println("Unsupported compression method"); + return false; +} diff --git a/lib/ZipFile/ZipFile.h b/lib/ZipFile/ZipFile.h index 9702072..36ac7c4 100644 --- a/lib/ZipFile/ZipFile.h +++ b/lib/ZipFile/ZipFile.h @@ -1,12 +1,19 @@ #pragma once +#include + +#include #include +#include "miniz.h" + class ZipFile { std::string filePath; + bool loadFileStat(const char* filename, mz_zip_archive_file_stat* fileStat) const; + long getDataOffset(const mz_zip_archive_file_stat& fileStat) const; public: explicit ZipFile(std::string filePath) : filePath(std::move(filePath)) {} ~ZipFile() = default; - char* readTextFileToMemory(const char* filename, size_t* size = nullptr) const; uint8_t* readFileToMemory(const char* filename, size_t* size = nullptr, bool trailingNullByte = false) const; + bool readFileToStream(const char* filename, Print& out, size_t chunkSize) const; }; diff --git a/src/screens/EpubReaderScreen.cpp b/src/screens/EpubReaderScreen.cpp index 5a87fc1..ed6c53a 100644 --- a/src/screens/EpubReaderScreen.cpp +++ b/src/screens/EpubReaderScreen.cpp @@ -135,6 +135,7 @@ void EpubReaderScreen::displayTaskLoop() { } } +// TODO: Failure handling void EpubReaderScreen::renderPage() { if (!epub) { return;