// Utf8.h declares the public interface implemented in this file. It is pulled
// in whenever it is on the include path; the standard headers this file relies
// on are included directly so the code does not depend on the header's
// transitive includes.
#if __has_include("Utf8.h")
#include "Utf8.h"
#endif

#include <cstddef>
#include <cstdint>
#include <string>

namespace {

// U+FFFD REPLACEMENT CHARACTER: returned in place of malformed input.
constexpr uint32_t kReplacementCodepoint = 0xFFFD;

// Latin ligature code points (Alphabetic Presentation Forms block).
constexpr uint32_t kLigatureFF = 0xFB00;
constexpr uint32_t kLigatureFI = 0xFB01;
constexpr uint32_t kLigatureFL = 0xFB02;
constexpr uint32_t kLigatureFFI = 0xFB03;
constexpr uint32_t kLigatureFFL = 0xFB04;

// True when `byte` has the 10xxxxxx bit pattern of a UTF-8 continuation byte.
constexpr bool isContinuationByte(const unsigned char byte) { return (byte & 0xC0) == 0x80; }

}  // namespace

/// Returns the number of bytes announced by the UTF-8 lead byte `c`.
///
/// @param c First byte of a sequence.
/// @return 1 for ASCII, 2-4 for a multi-byte lead byte, and 1 for any byte that
///         is not a valid lead byte (stray continuation byte, 0xF8-0xFF) so
///         that callers always make forward progress.
int utf8CodepointLen(const unsigned char c) {
  if (c < 0x80) return 1;          // 0xxxxxxx
  if ((c >> 5) == 0x6) return 2;   // 110xxxxx
  if ((c >> 4) == 0xE) return 3;   // 1110xxxx
  if ((c >> 3) == 0x1E) return 4;  // 11110xxx
  return 1;                        // fallback for invalid
}

/// Decodes the code point at `*string` and advances `*string` past it.
///
/// Malformed input never moves the cursor beyond the NUL terminator: every
/// continuation byte is checked before it is consumed. When the lead byte is
/// invalid, or an expected continuation byte is missing, exactly one byte is
/// consumed and U+FFFD is returned. Overlong encodings, surrogates and values
/// above U+10FFFF are not rejected.
///
/// @param string In/out cursor into a NUL-terminated UTF-8 string.
/// @return The decoded code point, U+FFFD for a malformed sequence, or 0 at the
///         end of the string (the cursor is then left untouched). Also returns
///         0 when `string` or `*string` is null.
uint32_t utf8NextCodepoint(const unsigned char** string) {
  if (string == nullptr || *string == nullptr) {
    return 0;
  }

  const unsigned char* const start = *string;
  const unsigned char lead = start[0];
  if (lead == 0) {
    return 0;  // end of string: do not step over the terminator
  }

  const int bytes = utf8CodepointLen(lead);
  if (bytes == 1) {
    *string = start + 1;
    // A one-byte length with the high bit set means the byte cannot start a
    // sequence (stray continuation byte or 0xF8-0xFF).
    return lead < 0x80 ? static_cast<uint32_t>(lead) : kReplacementCodepoint;
  }

  // Validate before consuming. The loop stops at the first bad byte, so a
  // string that ends in the middle of a sequence is never read past its NUL.
  for (int i = 1; i < bytes; i++) {
    if (!isContinuationByte(start[i])) {
      *string = start + 1;  // skip only the lead byte and resynchronise
      return kReplacementCodepoint;
    }
  }

  uint32_t cp = lead & ((1u << (7 - bytes)) - 1u);  // mask header bits
  for (int i = 1; i < bytes; i++) {
    cp = (cp << 6) | (start[i] & 0x3Fu);
  }
  *string = start + bytes;
  return cp;
}

/// Like utf8NextCodepoint(), but folds the ASCII sequences "ff", "fi", "fl",
/// "ffi" and "ffl" into their single ligature code points (U+FB00..U+FB04).
///
/// The longest match wins: "ffi" yields U+FB03 rather than U+FB00 followed by
/// 'i'. The cursor is advanced past every byte that became part of the result.
///
/// @param string In/out cursor into a NUL-terminated UTF-8 string.
/// @return The ligature code point, or whatever utf8NextCodepoint() returned
///         when no ligature starts at the cursor.
uint32_t utf8NextCodepointWithLigatures(const unsigned char** string) {
  const uint32_t cp = utf8NextCodepoint(string);
  if (cp != 'f') {
    return cp;
  }

  // All ligature members are ASCII, so peeking at raw bytes is equivalent to
  // decoding them. `rest[0]` is at worst the terminator, and `rest[1]` is only
  // read once `rest[0]` is known to be 'f'.
  const unsigned char* const rest = *string;
  switch (rest[0]) {
    case 'i':
      *string = rest + 1;
      return kLigatureFI;
    case 'l':
      *string = rest + 1;
      return kLigatureFL;
    case 'f':
      if (rest[1] == 'i') {
        *string = rest + 2;
        return kLigatureFFI;
      }
      if (rest[1] == 'l') {
        *string = rest + 2;
        return kLigatureFFL;
      }
      *string = rest + 1;  // plain "ff": leave the third character in place
      return kLigatureFF;
    default:
      return 'f';  // no ligature: the cursor already sits right after the 'f'
  }
}

/// Removes the last UTF-8 encoded character from `str`.
///
/// Walks backwards over continuation bytes (10xxxxxx) to the byte that starts
/// the final character and cuts the string there.
///
/// @param str String to shorten in place.
/// @return The new size of `str` (0 when it was already empty).
size_t utf8RemoveLastChar(std::string& str) {
  if (str.empty()) return 0;
  size_t pos = str.size() - 1;
  while (pos > 0 && isContinuationByte(static_cast<unsigned char>(str[pos]))) {
    --pos;
  }
  str.resize(pos);
  return pos;
}

/// Truncates `str` by removing `numChars` UTF-8 characters from its end,
/// stopping early once the string is empty.
///
/// @param str      String to shorten in place.
/// @param numChars Number of characters (not bytes) to remove.
void utf8TruncateChars(std::string& str, const size_t numChars) {
  for (size_t i = 0; i < numChars && !str.empty(); ++i) {
    utf8RemoveLastChar(str);
  }
}