From 314dfbbd7528b18e79005a41c0c19e7fec954460 Mon Sep 17 00:00:00 2001 From: deepend Date: Mon, 23 Feb 2026 16:11:41 -0700 Subject: [PATCH] =?UTF-8?q?Fixed=20highlight=20word=20parsing=20so=20Unico?= =?UTF-8?q?de=20symbols=20(including=20emoji=20like=20=F0=9F=90=9C)=20are?= =?UTF-8?q?=20treated=20as=20part=20of=20words=20when=20scanning=20incomin?= =?UTF-8?q?g=20text=20for=20alert=20matches,=20which=20enables=20/Extra=20?= =?UTF-8?q?words=20to=20highlight/=20entries=20containing=20emoji=20to=20w?= =?UTF-8?q?ork.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improved token scanning to use UTF-8-aware character classification (gunichar, g_unichar_isdigit, g_unichar_isalpha) instead of byte-only checks, avoiding split/mis-detection on multibyte characters. --- src/common/inbound.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/common/inbound.c b/src/common/inbound.c index adbe6d34..10f5cbc5 100644 --- a/src/common/inbound.c +++ b/src/common/inbound.c @@ -258,6 +258,7 @@ alert_match_text (char *text, char *masks) { unsigned char *p = text; unsigned char endchar; + gunichar ch; int res; if (masks[0] == 0) @@ -265,26 +266,36 @@ alert_match_text (char *text, char *masks) while (1) { - if (*p >= '0' && *p <= '9') + ch = g_utf8_get_char (p); + + if (g_unichar_isdigit (ch) || g_unichar_isalpha (ch)) { - p++; + p += g_utf8_skip [p[0]]; continue; } /* if it's RFC1459 , it can be inside a word */ - switch (*p) + switch (ch) { case '-': case '[': case ']': case '\\': case '`': case '^': case '{': case '}': case '_': case '|': - p++; + p += g_utf8_skip [p[0]]; + continue; + } + + /* Symbols (including emoji) can be part of highlighted words. */ + if (!g_unichar_isspace (ch) && !g_unichar_ispunct (ch) && + !g_unichar_iscntrl (ch)) + { + p += g_utf8_skip [p[0]]; continue; } /* if it's a 0, space or comma, the word has ended. */ if (*p == 0 || *p == ' ' || *p == ',' || /* if it's anything BUT a letter, the word has ended. */ - (!g_unichar_isalpha (g_utf8_get_char (p)))) + (!g_unichar_isalpha (ch))) { endchar = *p; *p = 0;