From 0935799f7d93ae4493c6fd12ae5b839cc4f7591f Mon Sep 17 00:00:00 2001 From: deepend Date: Tue, 24 Feb 2026 11:18:46 -0700 Subject: [PATCH] Updated emoji-token normalization to first apply Unicode compatibility composition normalization (NFKC, G_NORMALIZE_ALL_COMPOSE) before matching, so canonically equivalent and compatibility-equivalent sequences compare reliably in highlight checks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expanded ignored codepoints during highlight token comparison to include zero-width joiner (U+200D) in addition to variation selectors (U+FE0E/U+FE0F), which helps emoji entered from different input methods still match configured “extra highlight words.” This logic is used by the alert/highlight matching path (alert_match_word) that compares configured extra words against extracted message tokens. Word-boundary detection in alert_match_text is also reworked: symbol-category characters (math, currency, modifier, other) stay inside a token, while whitespace, punctuation, and control characters end it. --- src/common/inbound.c | 61 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/src/common/inbound.c b/src/common/inbound.c index 10f5cbc5..0b7ae0db 100644 --- a/src/common/inbound.c +++ b/src/common/inbound.c @@ -222,15 +222,51 @@ inbound_privmsg (server *serv, char *from, char *ip, char *text, int id, /* used for Alerts section. Masks can be separated by commas and spaces. */ +static char * +alert_normalize_word (const char *text) +{ + GString *normalized; + char *composed; + const char *p; + + composed = g_utf8_normalize (text, -1, G_NORMALIZE_ALL_COMPOSE); + if (!composed) + composed = g_strdup (text); + + normalized = g_string_sized_new (strlen (composed)); + p = composed; + + while (*p) + { + gunichar ch = g_utf8_get_char ((const guchar *)p); + + /* Ignore selector/joiner codepoints that vary by input method. 
*/ + if (ch != 0x200D && ch != 0xFE0E && ch != 0xFE0F) + g_string_append_unichar (normalized, ch); + + p = g_utf8_next_char (p); + } + + g_free (composed); + return g_string_free (normalized, FALSE); +} + gboolean alert_match_word (char *word, char *masks) { char *p = masks; char endchar; + char *word_normalized; + char *mask_normalized; int res; + word_normalized = alert_normalize_word (word); + if (masks[0] == 0) + { + g_free (word_normalized); return FALSE; + } while (1) { @@ -239,15 +275,23 @@ alert_match_word (char *word, char *masks) { endchar = *p; *p = 0; - res = match (g_strchug (masks), word); + mask_normalized = alert_normalize_word (g_strchug (masks)); + res = match (mask_normalized, word_normalized); + g_free (mask_normalized); *p = endchar; if (res) + { + g_free (word_normalized); return TRUE; /* yes, matched! */ + } masks = p + 1; if (*p == 0) + { + g_free (word_normalized); return FALSE; + } } p++; } @@ -259,6 +303,7 @@ alert_match_text (char *text, char *masks) unsigned char *p = text; unsigned char endchar; gunichar ch; + GUnicodeType ch_type; int res; if (masks[0] == 0) @@ -267,6 +312,7 @@ alert_match_text (char *text, char *masks) while (1) { ch = g_utf8_get_char (p); + ch_type = g_unichar_type (ch); if (g_unichar_isdigit (ch) || g_unichar_isalpha (ch)) { @@ -285,17 +331,18 @@ alert_match_text (char *text, char *masks) } /* Symbols (including emoji) can be part of highlighted words. */ - if (!g_unichar_isspace (ch) && !g_unichar_ispunct (ch) && - !g_unichar_iscntrl (ch)) + if (ch_type == G_UNICODE_MATH_SYMBOL || + ch_type == G_UNICODE_CURRENCY_SYMBOL || + ch_type == G_UNICODE_MODIFIER_SYMBOL || + ch_type == G_UNICODE_OTHER_SYMBOL) { p += g_utf8_skip [p[0]]; continue; } - /* if it's a 0, space or comma, the word has ended. */ - if (*p == 0 || *p == ' ' || *p == ',' || - /* if it's anything BUT a letter, the word has ended. */ - (!g_unichar_isalpha (ch))) + /* Delimiters end the word. 
*/ + if (*p == 0 || g_unichar_isspace (ch) || g_unichar_ispunct (ch) || + g_unichar_iscntrl (ch)) { endchar = *p; *p = 0;