From 314dfbbd7528b18e79005a41c0c19e7fec954460 Mon Sep 17 00:00:00 2001
From: deepend <deepend@tilde.club>
Date: Mon, 23 Feb 2026 16:11:41 -0700
Subject: [PATCH] =?UTF-8?q?Fixed=20highlight=20word=20parsing=20so=20Unico?=
 =?UTF-8?q?de=20symbols=20(including=20emoji=20like=20=F0=9F=90=9C)=20are?=
 =?UTF-8?q?=20treated=20as=20part=20of=20words=20when=20scanning=20incomin?=
 =?UTF-8?q?g=20text=20for=20alert=20matches,=20which=20enables=20/Extra=20?=
 =?UTF-8?q?words=20to=20highlight/=20entries=20containing=20emoji=20to=20w?=
 =?UTF-8?q?ork.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Token scanning now uses UTF-8-aware character classification (gunichar, g_unichar_isdigit, g_unichar_isalpha) instead of byte-only checks, so multibyte characters are no longer split apart or misclassified while scanning for word boundaries.
---
 src/common/inbound.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/src/common/inbound.c b/src/common/inbound.c
index adbe6d34..10f5cbc5 100644
--- a/src/common/inbound.c
+++ b/src/common/inbound.c
@@ -258,6 +258,7 @@ alert_match_text (char *text, char *masks)
 {
 	unsigned char *p = text;
 	unsigned char endchar;
+	gunichar ch;
 	int res;
 
 	if (masks[0] == 0)
@@ -265,26 +266,36 @@ alert_match_text (char *text, char *masks)
 
 	while (1)
 	{
-		if (*p >= '0' && *p <= '9')
+		ch = g_utf8_get_char (p);
+
+		if (g_unichar_isdigit (ch) || g_unichar_isalpha (ch))
 		{
-			p++;
+			p += g_utf8_skip [p[0]];
 			continue;
 		}
 
 		/* if it's RFC1459 <special>, it can be inside a word */
-		switch (*p)
+		switch (ch)
 		{
 		case '-': case '[': case ']': case '\\':
 		case '`': case '^': case '{': case '}':
 		case '_': case '|':
-			p++;
+			p += g_utf8_skip [p[0]];
+			continue;
+		}
+
+		/* Symbols (including emoji) can be part of highlighted words. */
+		if (!g_unichar_isspace (ch) && !g_unichar_ispunct (ch) &&
+			 !g_unichar_iscntrl (ch))
+		{
+			p += g_utf8_skip [p[0]];
 			continue;
 		}
 
 		/* if it's a 0, space or comma, the word has ended. */
 		if (*p == 0 || *p == ' ' || *p == ',' ||
 			/* if it's anything BUT a letter, the word has ended. */
-			 (!g_unichar_isalpha (g_utf8_get_char (p))))
+			 (!g_unichar_isalpha (ch)))
 		{
 			endchar = *p;
 			*p = 0;