// Test data for grapheme-aware string reversal.
// Each row is {input, expected}: `expected` is `input` with its
// user-perceived characters in reverse order. Non-ASCII text is spelled as
// explicit UTF-8 byte escapes so the table does not depend on the source
// file's encoding.
const char* cases[][2] = {
    // Plain ASCII strings (including the empty string).
    {"", ""},
    {"a", "a"},
    {"ab", "ba"},
    {"a b", "b a"},

    // Two-byte UTF-8 code points: U+0444 (D1 84) and U+0445 (D1 85).
    // A byte-wise reversal would split these sequences.
    {"\xd1\x84", "\xd1\x84"},
    {"x\xd1\x84", "\xd1\x84x"},
    {"y\xd1\x84z", "z\xd1\x84y"},
    {"\xd1\x84\xd1\x85", "\xd1\x85\xd1\x84"},

    // Base letter U+0418 (D0 98) followed by combining breve U+0306 (CC 86):
    // the mark must stay attached to (and after) its base letter.
    {"\xd0\x98\xcc\x86", "\xd0\x98\xcc\x86"},
    {"i\xd0\x98\xcc\x86", "\xd0\x98\xcc\x86i"},
    {"\xd0\x98\xcc\x86i", "i\xd0\x98\xcc\x86"},
    {"\xd0\x98\xcc\x86\xd1\x84", "\xd1\x84\xd0\x98\xcc\x86"},

    // Two combining marks on one base: U+0418 + U+0306 (breve) + U+0308
    // (diaeresis, CC 88), surrounded by ASCII letters.
    {"z\xd0\x98\xcc\x86\xcc\x88y", "y\xd0\x98\xcc\x86\xcc\x88z"}
};
As you can see, the solution needs something that understands how Unicode works “under the hood”, and that responsibility is usually delegated to the ICU library. This note can therefore also serve as a short ICU overview for those who, like me, are about to start using it.

// Decoding
icu::UnicodeString s = icu::UnicodeString::fromUTF8(cases[test_case][0]);
// Encoding
std::string result;
s.toUTF8String(result);
// Initialize iterator UErrorCode ec = U_ZERO_ERROR; icu::Locale ru_locale = icu::Locale("ru"); std::unique_ptr<icu::BreakIterator> iter; iter.reset(icu::BreakIterator::createCharacterInstance(ru_locale, ec)); iter->setText(my_unicode_string); // Set it to the beginning of my_unicode_string and get next character's position iter->first(); int32_t next_char = iter->next(); // Or set it to the after-last-character position and get previous character position iter->last(); int32_t this_char = iter->previous();
Source: https://habr.com/ru/post/222331/
All Articles