From 0935799f7d93ae4493c6fd12ae5b839cc4f7591f Mon Sep 17 00:00:00 2001
From: deepend <deepend@tilde.club>
Date: Tue, 24 Feb 2026 11:18:46 -0700
Subject: [PATCH] Updated emoji-token normalization to first apply Unicode
 NFKC normalization (G_NORMALIZE_ALL_COMPOSE) before matching, so canonically-
 and compatibility-equivalent sequences compare reliably in highlight checks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expanded ignored codepoints during highlight token comparison to include zero-width joiner (U+200D) in addition to variation selectors (U+FE0E/U+FE0F), which helps emoji entered from different input methods still match configured “extra highlight words.”

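This logic is used by the alert/highlight matching path (alert_match_word) that compares configured extra words against extracted message tokens. The tokenizer in alert_match_text is adjusted as well: math, currency, modifier and other symbols are treated as word characters, and a word ends at whitespace, punctuation or control characters.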
---
 src/common/inbound.c | 61 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/src/common/inbound.c b/src/common/inbound.c
index 10f5cbc5..0b7ae0db 100644
--- a/src/common/inbound.c
+++ b/src/common/inbound.c
@@ -222,15 +222,51 @@ inbound_privmsg (server *serv, char *from, char *ip, char *text, int id,
 
 /* used for Alerts section. Masks can be separated by commas and spaces. */
 
+/* Returns a newly allocated copy of text (release with g_free), normalized
+ * with G_NORMALIZE_ALL_COMPOSE and stripped of U+200D, U+FE0E and U+FE0F.
+ * Text that is not valid UTF-8 is returned as an unmodified copy. */
+static char *
+alert_normalize_word (const char *text)
+{
+	GString *normalized;
+	char *composed;
+	const char *p;
+
+	/* NULL means text is not valid UTF-8; walking it could overrun the NUL. */
+	composed = g_utf8_normalize (text, -1, G_NORMALIZE_ALL_COMPOSE);
+	if (!composed)
+		return g_strdup (text);
+
+	normalized = g_string_sized_new (strlen (composed));
+	for (p = composed; *p; p = g_utf8_next_char (p))
+	{
+		gunichar ch = g_utf8_get_char (p);
+
+		/* Ignore selector/joiner codepoints that vary by input method. */
+		if (ch != 0x200D && ch != 0xFE0E && ch != 0xFE0F)
+			g_string_append_unichar (normalized, ch);
+	}
+
+	g_free (composed);
+	return g_string_free (normalized, FALSE);
+}
+
 gboolean
 alert_match_word (char *word, char *masks)
 {
 	char *p = masks;
 	char endchar;
+	char *word_normalized;
+	char *mask_normalized;
 	int res;
 
+	word_normalized = alert_normalize_word (word);
+
 	if (masks[0] == 0)
+	{
+		g_free (word_normalized);
 		return FALSE;
+	}
 
 	while (1)
 	{
@@ -239,15 +275,23 @@ alert_match_word (char *word, char *masks)
 		{
 			endchar = *p;
 			*p = 0;
-			res = match (g_strchug (masks), word);
+			mask_normalized = alert_normalize_word (g_strchug (masks));
+			res = match (mask_normalized, word_normalized);
+			g_free (mask_normalized);
 			*p = endchar;
 
 			if (res)
+			{
+				g_free (word_normalized);
 				return TRUE;	/* yes, matched! */
+			}
 
 			masks = p + 1;
 			if (*p == 0)
+			{
+				g_free (word_normalized);
 				return FALSE;
+			}
 		}
 		p++;
 	}
@@ -259,6 +303,7 @@ alert_match_text (char *text, char *masks)
 	unsigned char *p = text;
 	unsigned char endchar;
 	gunichar ch;
+	GUnicodeType ch_type;
 	int res;
 
 	if (masks[0] == 0)
@@ -267,6 +312,7 @@ alert_match_text (char *text, char *masks)
 	while (1)
 	{
 		ch = g_utf8_get_char (p);
+		ch_type = g_unichar_type (ch);
 
 		if (g_unichar_isdigit (ch) || g_unichar_isalpha (ch))
 		{
@@ -285,17 +331,18 @@ alert_match_text (char *text, char *masks)
 		}
 
 		/* Symbols (including emoji) can be part of highlighted words. */
-		if (!g_unichar_isspace (ch) && !g_unichar_ispunct (ch) &&
-			 !g_unichar_iscntrl (ch))
+		if (ch_type == G_UNICODE_MATH_SYMBOL ||
+			 ch_type == G_UNICODE_CURRENCY_SYMBOL ||
+			 ch_type == G_UNICODE_MODIFIER_SYMBOL ||
+			 ch_type == G_UNICODE_OTHER_SYMBOL)
 		{
 			p += g_utf8_skip [p[0]];
 			continue;
 		}
 
-		/* if it's a 0, space or comma, the word has ended. */
-		if (*p == 0 || *p == ' ' || *p == ',' ||
-			/* if it's anything BUT a letter, the word has ended. */
-			 (!g_unichar_isalpha (ch)))
+		/* Delimiters end the word. */
+		if (*p == 0 || g_unichar_isspace (ch) || g_unichar_ispunct (ch) ||
+			 g_unichar_iscntrl (ch))
 		{
 			endchar = *p;
 			*p = 0;