mirror of
https://github.com/daveallie/crosspoint-reader.git
synced 2026-02-04 14:47:37 +03:00
format fix
This commit is contained in:
parent
3cf52d8bd1
commit
23183a6270
@ -1,5 +1,4 @@
|
||||
#include "EnglishHyphenator.h"
|
||||
#include "HyphenationLiterals.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@ -7,6 +6,8 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "HyphenationLiterals.h"
|
||||
|
||||
namespace {
|
||||
|
||||
char lowerLatinChar(const uint32_t cp) {
|
||||
@ -50,18 +51,15 @@ bool isEnglishFricativeChar(const char c) {
|
||||
|
||||
// Entry type for the English prefix/suffix tables: HyphenLiteralT instantiated with char elements.
using LatinLiteral = HyphenLiteralT<char>;
|
||||
|
||||
// English prefix table handed to appendLiteralBreaks (as its `prefixes` argument) by
// appendMorphologyBreaks. It holds 20 entries; each pairs a lowercase prefix literal with a second
// field equal to the literal's character count (e.g. "counter" -> 7).
// NOTE(review): the table appears twice in this span because the page shows the pre-format and
// post-format layout of the same definition; the entries and their order are identical in both.
constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2},
|
||||
{"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5},
|
||||
{"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3},
|
||||
{"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3},
|
||||
{"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
|
||||
constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {
|
||||
{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, {"dis", 3}, {"hyper", 5}, {"inter", 5},
|
||||
{"micro", 5}, {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, {"over", 4}, {"post", 4},
|
||||
{"pre", 3}, {"pro", 3}, {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
|
||||
|
||||
// English suffix table handed to appendLiteralBreaks (as its `suffixes` argument) by
// appendMorphologyBreaks. It holds 24 entries; each pairs a lowercase suffix literal with a second
// field equal to the literal's character count (e.g. "lessly" -> 6, "y" -> 1).
// NOTE(review): the table appears twice in this span because the page shows the pre-format and
// post-format layout of the same definition; the entries and their order are identical in both.
constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4},
|
||||
{"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
|
||||
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6},
|
||||
{"ly", 2}, {"ment", 4}, {"ments", 5},{"ness", 4},
|
||||
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4},
|
||||
{"wards", 5},{"ship", 4}, {"ships", 5},{"y", 1}}};
|
||||
constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {
|
||||
{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
|
||||
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
|
||||
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4}, {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
|
||||
|
||||
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
|
||||
|
||||
@ -111,8 +109,9 @@ bool englishBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t bre
|
||||
|
||||
// Collects prefix/suffix-based break candidates for an English word by delegating to
// appendLiteralBreaks with ENGLISH_PREFIXES and ENGLISH_SUFFIXES.
//   cps       - codepoints of the word; used here only by the break filter (englishBreakAllowed).
//   lowerWord - text matched against the literal tables (presumably the lowercased form of `cps`;
//               confirm at the call site).
//   indexes   - result vector, forwarded to appendLiteralBreaks by non-const reference.
// The lambda captures `cps` by reference so every candidate index is vetted by englishBreakAllowed.
// NOTE(review): the call is shown twice below only because the page renders the pre-format and
// post-format layout of the same single statement.
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
|
||||
std::vector<size_t>& indexes) {
|
||||
appendLiteralBreaks(lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES,
|
||||
[&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes);
|
||||
appendLiteralBreaks(
|
||||
lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES,
|
||||
[&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes);
|
||||
}
|
||||
|
||||
struct CharPair {
|
||||
@ -313,8 +312,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
|
||||
const size_t rightVowel = vowelPositions[v + 1];
|
||||
|
||||
if (rightVowel - leftVowel == 1) {
|
||||
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) &&
|
||||
englishBreakAllowed(cps, rightVowel)) {
|
||||
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && englishBreakAllowed(cps, rightVowel)) {
|
||||
indexes.push_back(rightVowel);
|
||||
}
|
||||
continue;
|
||||
|
||||
@ -30,8 +30,7 @@ bool matchesLiteralAt(const WordContainer& word, const size_t start, const Liter
|
||||
|
||||
template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn>
|
||||
void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes,
|
||||
const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed,
|
||||
std::vector<size_t>& indexes) {
|
||||
const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed, std::vector<size_t>& indexes) {
|
||||
const size_t length = lowerWord.size();
|
||||
|
||||
const auto tryPush = [&](const size_t breakIndex) {
|
||||
|
||||
@ -1,11 +1,12 @@
|
||||
#include "RussianHyphenator.h"
|
||||
#include "HyphenationLiterals.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "HyphenationLiterals.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// Entry type for the Russian prefix/suffix tables: HyphenLiteralT instantiated with uint32_t
// elements (the PFX_*/SFX_* arrays in this file store values such as 0x043E).
using CyrillicLiteral = HyphenLiteralT<uint32_t>;
|
||||
@ -23,10 +24,18 @@ constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E};
|
||||
constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};
|
||||
constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};
|
||||
|
||||
// Russian prefix table handed to appendLiteralBreaks (as its `prefixes` argument) by
// appendMorphologyBreaks. It holds 12 entries; each pairs a PFX_* uint32_t array with a length.
// For the arrays whose declarations are visible here (PFX_SAMO[4], PFX_OBO[3], PFX_PROTIV[6]) the
// length equals the array extent; the remaining PFX_* arrays are declared outside this view.
// NOTE(review): the table appears twice in this span because the page shows the pre-format and
// post-format layout of the same definition; the entries and their order are identical in both.
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3}, {PFX_RAZ, 3}, {PFX_POD, 3},
|
||||
{PFX_NAD, 3}, {PFX_PERE, 4}, {PFX_SVERH, 5},
|
||||
{PFX_MEZH, 3}, {PFX_SUPER, 5},{PFX_PRED, 4},
|
||||
{PFX_SAMO, 4}, {PFX_OBO, 3}, {PFX_PROTIV, 6}}};
|
||||
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3},
|
||||
{PFX_RAZ, 3},
|
||||
{PFX_POD, 3},
|
||||
{PFX_NAD, 3},
|
||||
{PFX_PERE, 4},
|
||||
{PFX_SVERH, 5},
|
||||
{PFX_MEZH, 3},
|
||||
{PFX_SUPER, 5},
|
||||
{PFX_PRED, 4},
|
||||
{PFX_SAMO, 4},
|
||||
{PFX_OBO, 3},
|
||||
{PFX_PROTIV, 6}}};
|
||||
|
||||
constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};
|
||||
constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};
|
||||
@ -41,10 +50,18 @@ constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C};
|
||||
constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};
|
||||
constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};
|
||||
|
||||
// Russian suffix table handed to appendLiteralBreaks (as its `suffixes` argument) by
// appendMorphologyBreaks. It holds 12 entries; each pairs an SFX_* uint32_t array with a length.
// For the arrays whose declarations are visible here (SFX_NOST[4], SFX_STVO[4], SFX_ISM[3],
// SFX_LIV[5], SFX_OST[4]) the length equals the array extent; the remaining SFX_* arrays are
// declared outside this view.
// NOTE(review): the table appears twice in this span because the page shows the pre-format and
// post-format layout of the same definition; the entries and their order are identical in both.
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4}, {SFX_STVO, 4}, {SFX_ENIE, 4},
|
||||
{SFX_ATION, 4}, {SFX_CHIK, 3}, {SFX_NIK, 3},
|
||||
{SFX_TEL, 4}, {SFX_SKII, 4}, {SFX_AL, 6},
|
||||
{SFX_ISM, 3}, {SFX_LIV, 5}, {SFX_OST, 4}}};
|
||||
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4},
|
||||
{SFX_STVO, 4},
|
||||
{SFX_ENIE, 4},
|
||||
{SFX_ATION, 4},
|
||||
{SFX_CHIK, 3},
|
||||
{SFX_NIK, 3},
|
||||
{SFX_TEL, 4},
|
||||
{SFX_SKII, 4},
|
||||
{SFX_AL, 6},
|
||||
{SFX_ISM, 3},
|
||||
{SFX_LIV, 5},
|
||||
{SFX_OST, 4}}};
|
||||
|
||||
std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
|
||||
std::vector<uint32_t> lower;
|
||||
@ -308,8 +325,9 @@ bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
|
||||
|
||||
// Collects prefix/suffix-based break candidates for a Russian word by delegating to
// appendLiteralBreaks with RUSSIAN_PREFIXES and RUSSIAN_SUFFIXES.
//   cps       - codepoints of the word; used here only by the break filter (russianBreakAllowed).
//   lowerWord - uint32_t sequence matched against the literal tables (presumably the lowercased
//               form of `cps`; confirm at the call site).
//   indexes   - result vector, forwarded to appendLiteralBreaks by non-const reference.
// The lambda captures `cps` by reference so every candidate index is vetted by russianBreakAllowed.
// NOTE(review): the call is shown twice below only because the page renders the pre-format and
// post-format layout of the same single statement.
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
|
||||
std::vector<size_t>& indexes) {
|
||||
appendLiteralBreaks(lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES,
|
||||
[&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes);
|
||||
appendLiteralBreaks(
|
||||
lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES,
|
||||
[&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes);
|
||||
}
|
||||
|
||||
// Produces syllable break indexes tailored to Russian phonotactics.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user