Xteink-X4-crosspoint-reader/lib/Epub/Epub/parsers/ContentOpfParser.h
Daniel 06ced8f2d1 perf: optimize large EPUB indexing from O(n²) to O(n log n)
Three optimizations for EPUBs with many chapters (e.g. 2768 chapters):

1. OPF idref→href lookup: Build sorted hash index during manifest parsing,
   use binary search during spine resolution. Reduces ~4min to ~30-60s.

2. TOC href→spineIndex lookup: Build sorted hash index in beginTocPass(),
   use binary search in createTocEntry(). Reduces ~4min to ~30-60s.

3. ZIP central-dir cursor: Resume scanning from last position instead of
   restarting from beginning. Reduces ~8min to ~1-3min.

All optimizations only activate for large EPUBs (≥400 spine items).
Small books use unchanged code paths.

Memory impact: ~33KB + ~39KB temporary during indexing, freed after.
Expected total: ~17min → ~3-5min for Shadow Slave (2768 chapters).

Also adds phase timing logs for performance measurement.
2026-01-20 23:35:54 -08:00

78 lines
2.0 KiB
C++

#pragma once
#include <Print.h>
#include <vector>
#include <algorithm>
#include "Epub.h"
#include "expat.h"
class BookMetadataCache;
class ContentOpfParser final : public Print {
enum ParserState {
START,
IN_PACKAGE,
IN_METADATA,
IN_BOOK_TITLE,
IN_BOOK_AUTHOR,
IN_BOOK_LANGUAGE,
IN_MANIFEST,
IN_SPINE,
IN_GUIDE,
};
const std::string& cachePath;
const std::string& baseContentPath;
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
BookMetadataCache* cache;
FsFile tempItemStore;
std::string coverItemId;
// Index for fast idref→href lookup (used only for large EPUBs)
struct ItemIndexEntry {
uint32_t idHash; // FNV-1a hash of itemId
uint16_t idLen; // length for collision reduction
uint32_t fileOffset; // offset in .items.bin
};
std::vector<ItemIndexEntry> itemIndex;
bool useItemIndex = false;
static constexpr uint16_t LARGE_SPINE_THRESHOLD = 400;
// FNV-1a hash function
static uint32_t fnvHash(const std::string& s) {
uint32_t hash = 2166136261u;
for (char c : s) {
hash ^= static_cast<uint8_t>(c);
hash *= 16777619u;
}
return hash;
}
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void characterData(void* userData, const XML_Char* s, int len);
static void endElement(void* userData, const XML_Char* name);
public:
std::string title;
std::string author;
std::string language;
std::string tocNcxPath;
std::string tocNavPath; // EPUB 3 nav document path
std::string coverItemHref;
std::string textReferenceHref;
explicit ContentOpfParser(const std::string& cachePath, const std::string& baseContentPath, const size_t xmlSize,
BookMetadataCache* cache)
: cachePath(cachePath), baseContentPath(baseContentPath), remainingSize(xmlSize), cache(cache) {}
~ContentOpfParser() override;
bool setup();
size_t write(uint8_t) override;
size_t write(const uint8_t* buffer, size_t size) override;
};