diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 71092b82e0f3..12796722b3fb 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -59,9 +59,6 @@ constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF] constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F] constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:] -// Unicode name property alias -constexpr char16_t NAME_PROP[] = u"na"; - } // namespace // Cached sets ------------------------------------------------------------- *** @@ -147,6 +144,56 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) { // memory leak checker tools #define _dbgct(me) +// Strips leading and trailing spaces and turns runs of spaces into single spaces. +// This should be replaced by UAX44-LM1 and UAX44-LM2 skeletonizations as part of ICU-3736. +template +UBool mungeCharName(std::basic_string_view src, char* dst, int32_t dstCapacity) { + int32_t j = 0; + --dstCapacity; /* make room for term. zero */ + if constexpr (!std::is_same_v) { + if (!uprv_isInvariantUString(src.data(), static_cast(src.size()))) { + return false; + } + } + for (CharT uch : src) { + char ch; + if constexpr (std::is_same_v) { + ch = uch; + } else { + // This would want to be UCHAR_TO_CHAR but that is defined in uinvchar.cpp. This function + // should not last long anyway (famous last words)… + u_UCharsToChars(&uch, &ch, 1); + } + if (ch == ' ' && (j == 0 || (j > 0 && dst[j - 1] == ' '))) { + continue; + } + if (j >= dstCapacity) return false; + dst[j++] = ch; + } + if (j > 0 && dst[j-1] == ' ') --j; + dst[j] = 0; + return true; +} + +// Returns the character with the given name or name alias, or U_SENTINEL if no such character +// exists. +template +UChar32 getCharacterByName(const std::basic_string_view name) { + // Must munge name, since u_charFromName() does not do 'loose' matching. + char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength + if (!mungeCharName(name, buf, sizeof(buf))) { + return U_SENTINEL; + } + for (const UCharNameChoice nameChoice : std::array{U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS}) { + UErrorCode ec = U_ZERO_ERROR; + UChar32 ch = u_charFromName(nameChoice, buf, &ec); + if (U_SUCCESS(ec)) { + return ch; + } + } + return U_SENTINEL; +} + } // namespace //---------------------------------------------------------------- @@ -657,19 +704,14 @@ class UnicodeSet::Lexer { } start = parsePosition_.getIndex(); } else if (last == u'}') { - UnicodeSet result; - result.applyPropertyAlias( - UnicodeString(NAME_PROP), - pattern_.tempSubStringBetween(start, parsePosition_.getIndex() - 1), - errorCode); - result.setPattern( - pattern_.tempSubStringBetween(start - 3, parsePosition_.getIndex())); - if ((hex.has_value() && result.charAt(0) != hex) || - (literal.has_value() && result.charAt(0) != literal)) { + const UChar32 result = getCharacterByName(std::u16string_view(pattern_).substr( + start, parsePosition_.getIndex() - 1 - start)); + if (result < 0 || (hex.has_value() && result != hex) || + (literal.has_value() && result != literal)) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return {}; } - return result.charAt(0); + return result; } } } @@ -1312,23 +1354,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, namespace { -UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { - /* Note: we use ' ' in compiler code page */ - int32_t j = 0; - char ch; - --dstCapacity; /* make room for term. zero */ - while ((ch = *src++) != 0) { - if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { - continue; - } - if (j >= dstCapacity) return false; - dst[j++] = ch; - } - if (j > 0 && dst[j-1] == ' ') --j; - dst[j] = 0; - return true; -} - } // namespace //---------------------------------------------------------------- @@ -1452,18 +1477,14 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, } case UCHAR_NAME: { - // Must munge name, since u_charFromName() does not do - // 'loose' matching. - char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength - if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); - UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); - if (U_SUCCESS(ec)) { - clear(); - add(ch); - return *this; - } else { + const UChar32 ch = + getCharacterByName(std::string_view(vname.data(), vname.length())); + if (ch < 0) { FAIL(ec); } + clear(); + add(ch); + return *this; } case UCHAR_UNICODE_1_NAME: // ICU 49 deprecates the Unicode_1_Name property APIs. @@ -1473,7 +1494,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, // Must munge name, since u_versionFromString() does not do // 'loose' matching. char buf[128]; - if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); + if (!mungeCharName(std::string_view(vname.data(), vname.length()), buf, + sizeof(buf))) + FAIL(ec); UVersionInfo version; u_versionFromString(version, buf); applyFilter(versionFilter, &version, diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 74e8fb5a9628..58f9128d8ec8 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4713,6 +4713,12 @@ void UnicodeSetTest::TestToPatternOutput() { // Ill-formed in ICU4C 78 and earlier, made well-formed by ICU-23312. {u"[a-{z}]", u"[a-z]"}, {u"[{a}-z]", u"[a-z]"}, + {uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}])", u"[︘]"}, + {uR"([\N{bell}])", u"[🔔]"}, + // Ill-formed in ICU 78 and earlier: + {uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}])", u"[︘]"}, + {uR"([\N{Hangul jungseong O-E}])", u"[ᆀ]"}, + {uR"([\N{Hangul jungseong OE}])", u"[ᅬ]"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); @@ -4832,6 +4838,10 @@ void UnicodeSetTest::TestParseErrors() { // Doubly negated property queries. uR"(\P{Decomposition_Type≠compat})", u"[:^Noncharacter_Code_Point≠No:]", + // This should be [\a]; tracked by ICU-8963. + uR"([\N{BEL}])", + // This should be [œ]; tracked by ICU-3736. + uR"([\N{Latin small ligature o-e}])", }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode);