Fixed highlight word parsing so that Unicode symbols (including emoji such as 🐜) are treated as part of a word when scanning incoming text for alert matches. This allows /Extra words to highlight/ entries that contain emoji to match.

Improved token scanning to use UTF-8-aware character classification (gunichar, g_unichar_isdigit, g_unichar_isalpha) instead of byte-only checks, so that words are no longer split or misdetected at multibyte characters.
This commit is contained in:
2026-02-23 16:11:41 -07:00
parent aed21ffcae
commit 314dfbbd75

View File

@@ -258,6 +258,7 @@ alert_match_text (char *text, char *masks)
{ {
unsigned char *p = text; unsigned char *p = text;
unsigned char endchar; unsigned char endchar;
gunichar ch;
int res; int res;
if (masks[0] == 0) if (masks[0] == 0)
@@ -265,26 +266,36 @@ alert_match_text (char *text, char *masks)
while (1) while (1)
{ {
if (*p >= '0' && *p <= '9') ch = g_utf8_get_char (p);
if (g_unichar_isdigit (ch) || g_unichar_isalpha (ch))
{ {
p++; p += g_utf8_skip [p[0]];
continue; continue;
} }
/* if it's RFC1459 <special>, it can be inside a word */ /* if it's RFC1459 <special>, it can be inside a word */
switch (*p) switch (ch)
{ {
case '-': case '[': case ']': case '\\': case '-': case '[': case ']': case '\\':
case '`': case '^': case '{': case '}': case '`': case '^': case '{': case '}':
case '_': case '|': case '_': case '|':
p++; p += g_utf8_skip [p[0]];
continue;
}
/* Symbols (including emoji) can be part of highlighted words. */
if (!g_unichar_isspace (ch) && !g_unichar_ispunct (ch) &&
!g_unichar_iscntrl (ch))
{
p += g_utf8_skip [p[0]];
continue; continue;
} }
/* if it's a 0, space or comma, the word has ended. */ /* if it's a 0, space or comma, the word has ended. */
if (*p == 0 || *p == ' ' || *p == ',' || if (*p == 0 || *p == ' ' || *p == ',' ||
/* if it's anything BUT a letter, the word has ended. */ /* if it's anything BUT a letter, the word has ended. */
(!g_unichar_isalpha (g_utf8_get_char (p)))) (!g_unichar_isalpha (ch)))
{ {
endchar = *p; endchar = *p;
*p = 0; *p = 0;