Stream inflated EPUB HTMLs down to disk instead of inflating in memory (#4)

* Downgrade miniz for stability

* Stream HTML from ZIP down to disk instead of loading all in mem
This commit is contained in:
Dave Allie 2025-12-08 00:39:17 +11:00 committed by GitHub
parent c715c18bf7
commit de453fed1d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 857 additions and 1108 deletions

View File

@ -9,7 +9,7 @@
bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
// open up the meta data to find where the content.opf file lives
size_t s;
const auto metaInfo = zip.readTextFileToMemory("META-INF/container.xml", &s);
const auto metaInfo = reinterpret_cast<char*>(zip.readFileToMemory("META-INF/container.xml", &s, true));
if (!metaInfo) {
Serial.println("Could not find META-INF/container.xml");
return false;
@ -57,7 +57,7 @@ bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) {
// read in the content.opf file and parse it
auto contents = zip.readTextFileToMemory(content_opf_file.c_str());
auto contents = reinterpret_cast<char*>(zip.readFileToMemory(content_opf_file.c_str(), nullptr, true));
// parse the contents
tinyxml2::XMLDocument doc;
@ -168,7 +168,7 @@ bool Epub::parseTocNcxFile(const ZipFile& zip) {
return false;
}
const auto ncxData = zip.readTextFileToMemory(tocNcxItem.c_str());
const auto ncxData = reinterpret_cast<char*>(zip.readFileToMemory(tocNcxItem.c_str(), nullptr, true));
if (!ncxData) {
Serial.printf("Could not find %s\n", tocNcxItem.c_str());
return false;
@ -308,11 +308,11 @@ std::string normalisePath(const std::string& path) {
return result;
}
uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const {
uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size, bool trailingNullByte) const {
const ZipFile zip("/sd" + filepath);
const std::string path = normalisePath(itemHref);
const auto content = zip.readFileToMemory(path.c_str(), size);
const auto content = zip.readFileToMemory(path.c_str(), size, trailingNullByte);
if (!content) {
Serial.printf("Failed to read item %s\n", path.c_str());
return nullptr;
@ -321,17 +321,11 @@ uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const
return content;
}
char* Epub::getTextItemContents(const std::string& itemHref, size_t* size) const {
bool Epub::readItemContentsToStream(const std::string& itemHref, Print& out, const size_t chunkSize) const {
const ZipFile zip("/sd" + filepath);
const std::string path = normalisePath(itemHref);
const auto content = zip.readTextFileToMemory(path.c_str(), size);
if (!content) {
Serial.printf("Failed to read item %s\n", path.c_str());
return nullptr;
}
return content;
return zip.readFileToStream(path.c_str(), out, chunkSize);
}
// Returns the number of entries currently held in `spine`, narrowed to int.
int Epub::getSpineItemsCount() const {
  const auto spineItemCount = spine.size();
  return static_cast<int>(spineItemCount);
}

View File

@ -56,8 +56,9 @@ class Epub {
const std::string& getPath() const;
const std::string& getTitle() const;
const std::string& getCoverImageItem() const;
uint8_t* getItemContents(const std::string& itemHref, size_t* size = nullptr) const;
char* getTextItemContents(const std::string& itemHref, size_t* size = nullptr) const;
uint8_t* readItemContentsToBytes(const std::string& itemHref, size_t* size = nullptr,
bool trailingNullByte = false) const;
bool readItemContentsToStream(const std::string& itemHref, Print& out, size_t chunkSize) const;
std::string& getSpineItem(int spineIndex);
int getSpineItemsCount() const;
EpubTocEntry& getTocItem(int tocTndex);

View File

@ -199,6 +199,11 @@ bool EpubHtmlParserSlim::parseAndBuildPages() {
XML_SetCharacterDataHandler(parser, characterData);
FILE* file = fopen(filepath, "r");
if (!file) {
Serial.printf("Couldn't open file %s\n", filepath);
XML_ParserFree(parser);
return false;
}
do {
void* const buf = XML_GetBuffer(parser, 1024);

View File

@ -64,35 +64,28 @@ void Section::setupCacheDir() const {
// Asks the SD filesystem to remove this section's cache directory.
// NOTE(review): rmdir presumably only succeeds on an empty directory; confirm cached files are removed elsewhere.
void Section::clearCache() const {
  const char* const cacheDir = cachePath.c_str();
  SD.rmdir(cacheDir);
}
bool Section::persistPageDataToSD() {
size_t size = 0;
auto localPath = epub->getSpineItem(spineIndex);
const auto localPath = epub->getSpineItem(spineIndex);
const auto html = epub->getItemContents(epub->getSpineItem(spineIndex), &size);
if (!html) {
Serial.println("Failed to read item contents");
return false;
}
// TODO: Would love to stream this through an XML visitor
// TODO: Should we get rid of this file altogether?
// It currently saves us a bit of memory by allowing for all the inflation bits to be released
// before loading the XML parser
const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html";
File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE);
const auto written = f.write(html, size);
File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE, true);
bool success = epub->readItemContentsToStream(localPath, f, 1024);
f.close();
free(html);
Serial.printf("Wrote %d bytes to %s\n", written, tmpHtmlPath.c_str());
if (size != written) {
Serial.println("Failed to inflate section contents to SD");
SD.remove(tmpHtmlPath.c_str());
if (!success) {
Serial.println("Failed to stream item contents");
return false;
}
Serial.printf("Streamed HTML to %s\n", tmpHtmlPath.c_str());
const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
auto visitor =
EpubHtmlParserSlim(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
const bool success = visitor.parseAndBuildPages();
success = visitor.parseAndBuildPages();
SD.remove(tmpHtmlPath.c_str());
if (!success) {

View File

@ -3,51 +3,37 @@
#include <HardwareSerial.h>
#include <miniz.h>
int libzInflateOneShot(const uint8_t* inputBuff, const size_t compSize, uint8_t* outputBuff, const size_t uncompSize) {
mz_stream pStream = {
.next_in = inputBuff,
.avail_in = compSize,
.total_in = 0,
.next_out = outputBuff,
.avail_out = uncompSize,
.total_out = 0,
};
bool inflateOneShot(const uint8_t* inputBuf, const size_t deflatedSize, uint8_t* outputBuf, const size_t inflatedSize) {
// Setup inflator
const auto inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
if (!inflator) {
Serial.println("Failed to allocate memory for inflator");
return false;
}
memset(inflator, 0, sizeof(tinfl_decompressor));
tinfl_init(inflator);
int status = 0;
status = mz_inflateInit2(&pStream, -MZ_DEFAULT_WINDOW_BITS);
size_t inBytes = deflatedSize;
size_t outBytes = inflatedSize;
const tinfl_status status = tinfl_decompress(inflator, inputBuf, &inBytes, nullptr, outputBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
free(inflator);
if (status != MZ_OK) {
Serial.printf("inflateInit2 failed: %d\n", status);
return status;
if (status != TINFL_STATUS_DONE) {
Serial.printf("tinfl_decompress() failed with status %d\n", status);
return false;
}
status = mz_inflate(&pStream, MZ_FINISH);
if (status != MZ_STREAM_END) {
Serial.printf("inflate failed: %d\n", status);
return status;
}
status = mz_inflateEnd(&pStream);
if (status != MZ_OK) {
Serial.printf("inflateEnd failed: %d\n", status);
return status;
}
return status;
return true;
}
// Reads `filename` via readFileToMemory() with a trailing null byte requested and hands the
// buffer back as a char*. Returns nullptr when the underlying read fails.
char* ZipFile::readTextFileToMemory(const char* filename, size_t* size) const {
  uint8_t* const rawBytes = readFileToMemory(filename, size, true);
  if (rawBytes == nullptr) {
    return nullptr;
  }
  return reinterpret_cast<char*>(rawBytes);
}
uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trailingNullByte) const {
bool ZipFile::loadFileStat(const char* filename, mz_zip_archive_file_stat* fileStat) const {
mz_zip_archive zipArchive = {};
const bool status = mz_zip_reader_init_file(&zipArchive, filePath.c_str(), 0);
if (!status) {
Serial.printf("mz_zip_reader_init_file() failed!\nError %s\n", mz_zip_get_error_string(zipArchive.m_last_error));
return nullptr;
return false;
}
// find the file
@ -55,41 +41,57 @@ uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trai
if (!mz_zip_reader_locate_file_v2(&zipArchive, filename, nullptr, 0, &fileIndex)) {
Serial.printf("Could not find file %s\n", filename);
mz_zip_reader_end(&zipArchive);
return nullptr;
return false;
}
mz_zip_archive_file_stat fileStat;
if (!mz_zip_reader_file_stat(&zipArchive, fileIndex, &fileStat)) {
if (!mz_zip_reader_file_stat(&zipArchive, fileIndex, fileStat)) {
Serial.printf("mz_zip_reader_file_stat() failed!\nError %s\n", mz_zip_get_error_string(zipArchive.m_last_error));
mz_zip_reader_end(&zipArchive);
return nullptr;
return false;
}
mz_zip_reader_end(&zipArchive);
return true;
}
uint8_t pLocalHeader[30];
uint64_t fileOffset = fileStat.m_local_header_ofs;
long ZipFile::getDataOffset(const mz_zip_archive_file_stat& fileStat) const {
constexpr auto localHeaderSize = 30;
// Reopen the file to manually read out the deflated bytes
FILE* file = fopen(filePath.c_str(), "rb");
uint8_t pLocalHeader[localHeaderSize];
const uint64_t fileOffset = fileStat.m_local_header_ofs;
FILE* file = fopen(filePath.c_str(), "r");
fseek(file, fileOffset, SEEK_SET);
const size_t read = fread(pLocalHeader, 1, localHeaderSize, file);
fclose(file);
const size_t read = fread(pLocalHeader, 1, 30, file);
if (read != 30) {
if (read != localHeaderSize) {
Serial.println("Something went wrong reading the local header");
fclose(file);
return nullptr;
return -1;
}
if (pLocalHeader[0] + (pLocalHeader[1] << 8) + (pLocalHeader[2] << 16) + (pLocalHeader[3] << 24) !=
0x04034b50 /* MZ_ZIP_LOCAL_DIR_HEADER_SIG */) {
Serial.println("Not a valid zip file header");
fclose(file);
return nullptr;
return -1;
}
const uint16_t filenameLength = pLocalHeader[26] + (pLocalHeader[27] << 8);
const uint16_t extraOffset = pLocalHeader[28] + (pLocalHeader[29] << 8);
fileOffset += 30 + filenameLength + extraOffset;
return fileOffset + localHeaderSize + filenameLength + extraOffset;
}
uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, const bool trailingNullByte) const {
mz_zip_archive_file_stat fileStat;
if (!loadFileStat(filename, &fileStat)) {
return nullptr;
}
const long fileOffset = getDataOffset(fileStat);
if (fileOffset < 0) {
return nullptr;
}
FILE* file = fopen(filePath.c_str(), "rb");
fseek(file, fileOffset, SEEK_SET);
const auto deflatedDataSize = static_cast<size_t>(fileStat.m_comp_size);
@ -97,15 +99,19 @@ uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trai
const auto dataSize = trailingNullByte ? inflatedDataSize + 1 : inflatedDataSize;
const auto data = static_cast<uint8_t*>(malloc(dataSize));
if (!fileStat.m_method) {
if (fileStat.m_method == MZ_NO_COMPRESSION) {
// no deflation, just read content
const size_t dataRead = fread(data, 1, inflatedDataSize, file);
fclose(file);
if (dataRead != inflatedDataSize) {
Serial.println("Failed to read data");
free(data);
return nullptr;
}
} else {
// Continue out of block with data set
} else if (fileStat.m_method == MZ_DEFLATED) {
// Read out deflated content from file
const auto deflatedData = static_cast<uint8_t*>(malloc(deflatedDataSize));
if (deflatedData == nullptr) {
@ -116,25 +122,186 @@ uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, bool trai
const size_t dataRead = fread(deflatedData, 1, deflatedDataSize, file);
fclose(file);
if (dataRead != deflatedDataSize) {
Serial.printf("Failed to read data, expected %d got %d\n", deflatedDataSize, dataRead);
free(deflatedData);
free(data);
return nullptr;
}
const int result = libzInflateOneShot(deflatedData, deflatedDataSize, data, inflatedDataSize);
bool success = inflateOneShot(deflatedData, deflatedDataSize, data, inflatedDataSize);
free(deflatedData);
if (result != MZ_OK) {
if (!success) {
Serial.println("Failed to inflate file");
free(data);
return nullptr;
}
// Continue out of block with data set
} else {
Serial.println("Unsupported compression method");
fclose(file);
return nullptr;
}
if (trailingNullByte) {
data[inflatedDataSize] = '\0';
}
if (size) {
*size = inflatedDataSize;
}
if (trailingNullByte) data[inflatedDataSize] = '\0';
if (size) *size = inflatedDataSize;
return data;
}
bool ZipFile::readFileToStream(const char* filename, Print& out, const size_t chunkSize) const {
mz_zip_archive_file_stat fileStat;
if (!loadFileStat(filename, &fileStat)) {
return false;
}
const long fileOffset = getDataOffset(fileStat);
if (fileOffset < 0) {
return false;
}
FILE* file = fopen(filePath.c_str(), "rb");
fseek(file, fileOffset, SEEK_SET);
const auto deflatedDataSize = static_cast<size_t>(fileStat.m_comp_size);
const auto inflatedDataSize = static_cast<size_t>(fileStat.m_uncomp_size);
if (fileStat.m_method == MZ_NO_COMPRESSION) {
// no deflation, just read content
const auto buffer = static_cast<uint8_t*>(malloc(chunkSize));
if (!buffer) {
Serial.println("Failed to allocate memory for buffer");
fclose(file);
return false;
}
size_t remaining = inflatedDataSize;
while (remaining > 0) {
const size_t dataRead = fread(buffer, 1, remaining < chunkSize ? remaining : chunkSize, file);
if (dataRead == 0) {
Serial.println("Could not read more bytes");
free(buffer);
fclose(file);
return false;
}
out.write(buffer, dataRead);
remaining -= dataRead;
}
fclose(file);
free(buffer);
return true;
}
if (fileStat.m_method == MZ_DEFLATED) {
// Setup inflator
const auto inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
if (!inflator) {
Serial.println("Failed to allocate memory for inflator");
fclose(file);
return false;
}
memset(inflator, 0, sizeof(tinfl_decompressor));
tinfl_init(inflator);
// Setup file read buffer
const auto fileReadBuffer = static_cast<uint8_t*>(malloc(chunkSize));
if (!fileReadBuffer) {
Serial.println("Failed to allocate memory for zip file read buffer");
free(inflator);
fclose(file);
return false;
}
const auto outputBuffer = static_cast<uint8_t*>(malloc(TINFL_LZ_DICT_SIZE));
if (!outputBuffer) {
Serial.println("Failed to allocate memory for dictionary");
free(inflator);
free(fileReadBuffer);
fclose(file);
return false;
}
memset(outputBuffer, 0, TINFL_LZ_DICT_SIZE);
size_t fileRemainingBytes = deflatedDataSize;
size_t processedOutputBytes = 0;
size_t fileReadBufferFilledBytes = 0;
size_t fileReadBufferCursor = 0;
size_t outputCursor = 0; // Current offset in the circular dictionary
while (true) {
// Load more compressed bytes when needed
if (fileReadBufferCursor >= fileReadBufferFilledBytes) {
if (fileRemainingBytes == 0) {
// Should not be hit, but a safe protection
break; // EOF
}
fileReadBufferFilledBytes =
fread(fileReadBuffer, 1, fileRemainingBytes < chunkSize ? fileRemainingBytes : chunkSize, file);
fileRemainingBytes -= fileReadBufferFilledBytes;
fileReadBufferCursor = 0;
if (fileReadBufferFilledBytes == 0) {
// Bad read
break; // EOF
}
}
// Available bytes in fileReadBuffer to process
size_t inBytes = fileReadBufferFilledBytes - fileReadBufferCursor;
// Space remaining in outputBuffer
size_t outBytes = TINFL_LZ_DICT_SIZE - outputCursor;
const tinfl_status status = tinfl_decompress(inflator, fileReadBuffer + fileReadBufferCursor, &inBytes,
outputBuffer, outputBuffer + outputCursor, &outBytes,
fileRemainingBytes > 0 ? TINFL_FLAG_HAS_MORE_INPUT : 0);
// Update input position
fileReadBufferCursor += inBytes;
// Write output chunk
if (outBytes > 0) {
processedOutputBytes += outBytes;
out.write(outputBuffer + outputCursor, outBytes);
// Update output position in buffer (with wraparound)
outputCursor = (outputCursor + outBytes) & (TINFL_LZ_DICT_SIZE - 1);
}
Serial.printf("Decompressing - %d/%d deflated into %d/%d inflated\n", deflatedDataSize - fileRemainingBytes,
deflatedDataSize, processedOutputBytes, inflatedDataSize);
if (status < 0) {
Serial.printf("tinfl_decompress() failed with status %d\n", status);
fclose(file);
free(outputBuffer);
free(fileReadBuffer);
free(inflator);
return false;
}
if (status == TINFL_STATUS_DONE) {
Serial.println("Decompression finished");
fclose(file);
free(inflator);
free(fileReadBuffer);
free(outputBuffer);
return true;
}
}
// If we get here, EOF reached without TINFL_STATUS_DONE
Serial.println("Unexpected EOF");
fclose(file);
free(outputBuffer);
free(fileReadBuffer);
free(inflator);
return false;
}
Serial.println("Unsupported compression method");
return false;
}

View File

@ -1,12 +1,19 @@
#pragma once
#include <Print.h>
#include <functional>
#include <string>
#include "miniz.h"
// Reads individual entries out of a ZIP archive stored on disk.
class ZipFile {
// Path of the archive; passed to fopen() and mz_zip_reader_init_file().
std::string filePath;
// Locates `filename` in the archive and fills `*fileStat` with its stat record.
// Returns false (after logging) if the archive cannot be opened, the entry is not found,
// or the stat call fails.
bool loadFileStat(const char* filename, mz_zip_archive_file_stat* fileStat) const;
// Reads the entry's 30-byte local header and returns the archive offset just past the
// header's filename and extra field. Returns -1 if the header cannot be read or does not
// carry the local-header signature.
long getDataOffset(const mz_zip_archive_file_stat& fileStat) const;
public:
explicit ZipFile(std::string filePath) : filePath(std::move(filePath)) {}
~ZipFile() = default;
// Same as readFileToMemory() with trailingNullByte = true, with the buffer returned as char*.
char* readTextFileToMemory(const char* filename, size_t* size = nullptr) const;
// Returns a malloc()'d buffer holding the entry's inflated contents, or nullptr on failure.
// When `size` is non-null it receives the inflated size (not counting the optional
// terminator). When `trailingNullByte` is true one extra '\0' byte is appended.
// NOTE(review): callers appear to be responsible for free()ing the buffer - confirm.
uint8_t* readFileToMemory(const char* filename, size_t* size = nullptr, bool trailingNullByte = false) const;
// Writes the entry's contents to `out`, reading the archive `chunkSize` bytes at a time.
// Returns false on failure.
bool readFileToStream(const char* filename, Print& out, size_t chunkSize) const;
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -135,6 +135,7 @@ void EpubReaderScreen::displayTaskLoop() {
}
}
// TODO: Failure handling
void EpubReaderScreen::renderPage() {
if (!epub) {
return;