Validate host links with PSL rules

This commit is contained in:
2026-04-13 20:10:28 -06:00
parent 150ad62771
commit 8f7c40caf1
5 changed files with 211 additions and 3 deletions

View File

@@ -29,6 +29,7 @@
<ClInclude Include="plugin-timer.h" /> <ClInclude Include="plugin-timer.h" />
<ClInclude Include="plugin.h" /> <ClInclude Include="plugin.h" />
<ClInclude Include="proto-irc.h" /> <ClInclude Include="proto-irc.h" />
<ClInclude Include="public_suffix_data.h" />
<ClInclude Include="server.h" /> <ClInclude Include="server.h" />
<ClInclude Include="servlist.h" /> <ClInclude Include="servlist.h" />
<ClInclude Include="ssl.h" /> <ClInclude Include="ssl.h" />
@@ -109,6 +110,7 @@
<Command><![CDATA[ <Command><![CDATA[
SET SOLUTIONDIR=$(SolutionDir)..\ SET SOLUTIONDIR=$(SolutionDir)..\
"$(Python3Path)\python.exe" $(ProjectDir)make-te.py "$(ProjectDir)textevents.in" "$(ZoiteChatLib)textevents.h" "$(ZoiteChatLib)textenums.h" "$(Python3Path)\python.exe" $(ProjectDir)make-te.py "$(ProjectDir)textevents.in" "$(ZoiteChatLib)textevents.h" "$(ZoiteChatLib)textenums.h"
"$(Python3Path)\python.exe" $(ProjectDir)gen-public-suffix.py "$(ZoiteChatLib)public_suffix_data.h"
powershell -File "$(SolutionDir)..\win32\version-template.ps1" "$(SolutionDir)..\win32\config.h.tt" "$(ZoiteChatLib)config.h" powershell -File "$(SolutionDir)..\win32\version-template.ps1" "$(SolutionDir)..\win32\config.h.tt" "$(ZoiteChatLib)config.h"
$(GlibGenMarshal) --prefix=_zoitechat_marshal --header "$(ProjectDir)marshalers.list" --output "$(ZoiteChatLib)marshal.h" $(GlibGenMarshal) --prefix=_zoitechat_marshal --header "$(ProjectDir)marshalers.list" --output "$(ZoiteChatLib)marshal.h"
$(GlibGenMarshal) --prefix=_zoitechat_marshal --body "$(ProjectDir)marshalers.list" --output "$(ZoiteChatLib)marshal.c" $(GlibGenMarshal) --prefix=_zoitechat_marshal --body "$(ProjectDir)marshalers.list" --output "$(ZoiteChatLib)marshal.c"

View File

@@ -65,6 +65,9 @@
<ClInclude Include="proto-irc.h"> <ClInclude Include="proto-irc.h">
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="public_suffix_data.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="server.h"> <ClInclude Include="server.h">
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
import sys
import urllib.request
from pathlib import Path
# Location of the Public Suffix List that main() downloads and converts.
URL = "https://raw.githubusercontent.com/publicsuffix/list/main/public_suffix_list.dat"
def parse_rules(text: str):
    """Extract the unique, lower-cased rules from Public Suffix List text.

    Blank lines and lines starting with ``//`` are skipped.  When a line
    contains a space or a tab, only its first whitespace-delimited token is
    kept.

    Returns the rules as a sorted list without duplicates.
    """
    unique = set()
    for raw in text.splitlines():
        entry = raw.strip()
        if entry == "" or entry.startswith("//"):
            continue
        # Anything after the first run of whitespace is not part of the rule.
        if " " in entry or "\t" in entry:
            entry = entry.split()[0]
        unique.add(entry.lower())
    return sorted(unique)
def emit_header(path: str, rules):
    """Write ``rules`` to ``path`` as a C header defining the rule table.

    The header declares ``public_suffix_rules`` (an array of string literals)
    and ``public_suffix_rules_len`` (its element count).

    Every rule is encoded as UTF-8 and written as a pure-ASCII C literal:
    bytes outside printable ASCII become three-digit octal escapes, so the
    compiled strings hold the same UTF-8 bytes regardless of the source
    character set the C compiler assumes.

    Raises:
        ValueError: if ``rules`` is empty.  An empty brace initializer is not
            valid C before C23, so the header would not compile.  The check
            runs before ``path`` is opened, leaving any existing file intact.
    """
    rules = list(rules)
    if not rules:
        raise ValueError("refusing to emit an empty public suffix table")

    def c_literal(rule):
        pieces = []
        for byte in rule.encode("utf-8"):
            ch = chr(byte)
            if ch in ('"', "\\", "?"):
                # "\?" is a standard escape; it keeps "??x" from forming a trigraph.
                pieces.append("\\" + ch)
            elif 0x20 <= byte < 0x7F:
                pieces.append(ch)
            else:
                # Octal escapes take at most three digits, so a digit that
                # follows can never be absorbed into the escape (unlike \x).
                pieces.append(f"\\{byte:03o}")
        return "".join(pieces)

    with open(path, "w", encoding="ascii", newline="\n") as out:
        out.write("#pragma once\n")
        out.write("static const char * const public_suffix_rules[] = {\n")
        for rule in rules:
            out.write(f'\t"{c_literal(rule)}",\n')
        out.write("};\n")
        out.write(
            "static const unsigned int public_suffix_rules_len = sizeof(public_suffix_rules) / sizeof(public_suffix_rules[0]);\n"
        )
def main():
    """Generate the public suffix header at the path given on the command line.

    The list is downloaded from ``URL``, parsed and written to the output
    path.  If any step fails (network error, decode error, empty rule list,
    write error) and a ``public_suffix_data.h`` exists next to this script,
    that file is copied to the output instead and the reason is reported on
    stderr.  Without such a fallback file the original error propagates.

    Exits with a usage message unless exactly one argument is supplied.
    """
    if len(sys.argv) != 2:
        raise SystemExit("usage: gen-public-suffix.py <output>")
    output = Path(sys.argv[1])
    fallback = Path(__file__).with_name("public_suffix_data.h")
    try:
        with urllib.request.urlopen(URL, timeout=30) as resp:
            data = resp.read().decode("utf-8")
        rules = parse_rules(data)
        if not rules:
            # A response without a single rule would yield an unusable table.
            raise ValueError("downloaded public suffix list contains no rules")
        emit_header(str(output), rules)
    except Exception as exc:
        if not fallback.exists():
            raise
        # Best effort: keep the build going, but say why the copy was used.
        print(
            f"gen-public-suffix.py: {exc!r}; using bundled {fallback.name}",
            file=sys.stderr,
        )
        output.write_bytes(fallback.read_bytes())


if __name__ == "__main__":
    main()

View File

@@ -96,6 +96,14 @@ marshal = [
make_te = find_program('make-te.py') make_te = find_program('make-te.py')
# Python interpreter used to run the header generator; configuration fails if it is missing.
python3 = find_program('python3', required: true)
# Generates public_suffix_data.h by running gen-public-suffix.py with the
# output path as its only argument.
public_suffix_data = custom_target('public_suffix_data_h',
output: 'public_suffix_data.h',
command: [python3, files('gen-public-suffix.py'), '@OUTPUT@']
)
textevents = custom_target('textevents', textevents = custom_target('textevents',
input: 'textevents.in', input: 'textevents.in',
output: ['textevents.h', 'textenums.h'], output: ['textevents.h', 'textenums.h'],
@@ -119,7 +127,7 @@ if get_option('plugin')
endif endif
zoitechat_common = static_library('zoitechatcommon', zoitechat_common = static_library('zoitechatcommon',
sources: [textevents] + marshal + common_sources, sources: [textevents, public_suffix_data] + marshal + common_sources,
include_directories: config_h_include, include_directories: config_h_include,
dependencies: common_deps + common_sysinfo_deps, dependencies: common_deps + common_sysinfo_deps,
c_args: common_cflags, c_args: common_cflags,
@@ -127,7 +135,7 @@ zoitechat_common = static_library('zoitechatcommon',
) )
zoitechat_common_dep = declare_dependency( zoitechat_common_dep = declare_dependency(
sources: [textevents] + marshal, sources: [textevents, public_suffix_data] + marshal,
link_with: zoitechat_common, link_with: zoitechat_common,
include_directories: common_includes, include_directories: common_includes,
compile_args: common_cflags, compile_args: common_cflags,

View File

@@ -20,12 +20,14 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <ctype.h> #include <ctype.h>
#include <glib.h>
#include "zoitechat.h" #include "zoitechat.h"
#include "zoitechatc.h" #include "zoitechatc.h"
#include "cfgfiles.h" #include "cfgfiles.h"
#include "fe.h" #include "fe.h"
#include "tree.h" #include "tree.h"
#include "url.h" #include "url.h"
#include "public_suffix_data.h"
#ifdef HAVE_STRINGS_H #ifdef HAVE_STRINGS_H
#include <strings.h> #include <strings.h>
#endif #endif
@@ -35,6 +37,7 @@ GTree *url_btree = NULL;
static gboolean regex_match (const GRegex *re, const char *word, static gboolean regex_match (const GRegex *re, const char *word,
int *start, int *end); int *start, int *end);
static const GRegex *re_url (void); static const GRegex *re_url (void);
static const GRegex *re_url_no_scheme (void);
static const GRegex *re_email (void); static const GRegex *re_email (void);
static const GRegex *re_nick (void); static const GRegex *re_nick (void);
static const GRegex *re_channel (void); static const GRegex *re_channel (void);
@@ -42,6 +45,8 @@ static gboolean match_nick (const char *word, int *start, int *end);
static gboolean match_channel (const char *word, int *start, int *end); static gboolean match_channel (const char *word, int *start, int *end);
static gboolean match_url (const char *word, int *start, int *end); static gboolean match_url (const char *word, int *start, int *end);
static gboolean match_email (const char *word, int *start, int *end); static gboolean match_email (const char *word, int *start, int *end);
static gboolean host_has_public_suffix (const char *host);
static gboolean host_has_public_suffix_range (const char *word, int start, int end);
static int static int
url_free (char *url, void *data) url_free (char *url, void *data)
@@ -266,7 +271,16 @@ match_channel (const char *word, int *start, int *end)
static gboolean static gboolean
match_url (const char *word, int *start, int *end) match_url (const char *word, int *start, int *end)
{ {
return regex_match (re_url (), word, start, end); if (regex_match (re_url (), word, start, end))
return TRUE;
if (!regex_match (re_url_no_scheme (), word, start, end))
return FALSE;
if (*start > 0 && word[*start - 1] == '@')
return FALSE;
return host_has_public_suffix_range (word, *start, *end);
} }
static gboolean static gboolean
@@ -393,6 +407,114 @@ regex_match (const GRegex *re, const char *word, int *start, int *end)
return found; return found;
} }
/*
 * Run the public suffix check on the host part of word[start, end).
 *
 * The range is copied and the host is taken to be everything before the
 * first ':' or '/'.  Returns FALSE without a lookup when the range begins
 * with '[' (presumably a bracketed address literal) or when the host part
 * is empty; otherwise returns the result of host_has_public_suffix().
 */
static gboolean
host_has_public_suffix_range (const char *word, int start, int end)
{
	char *span;
	char *hostname;
	size_t hostname_len;
	gboolean result = FALSE;

	span = g_strndup (word + start, end - start);

	/* length of the leading run that contains neither ':' nor '/' */
	hostname_len = strcspn (span, ":/");

	if (span[0] != '[' && hostname_len > 0)
	{
		hostname = g_strndup (span, hostname_len);
		result = host_has_public_suffix (hostname);
		g_free (hostname);
	}

	g_free (span);
	return result;
}
/*
 * Return the set of public suffix rules, building it on the first call.
 *
 * The set holds the pointers from public_suffix_rules themselves (no
 * copies) and is kept in a function-local static for later calls.
 */
static GHashTable *
public_suffix_table (void)
{
	static GHashTable *rule_set = NULL;
	unsigned int idx;

	if (rule_set == NULL)
	{
		rule_set = g_hash_table_new (g_str_hash, g_str_equal);
		for (idx = 0; idx < public_suffix_rules_len; idx++)
			g_hash_table_add (rule_set, (gpointer)public_suffix_rules[idx]);
	}

	return rule_set;
}
/*
 * Report whether some dot-separated tail of host appears in the public
 * suffix rule table.
 *
 * host must contain at least one '.', and no label may be empty (leading,
 * trailing or doubled dots); otherwise FALSE is returned.  For each tail,
 * longest first, three forms are looked up: the tail itself, "*." followed
 * by the tail minus its first label, and (except for the full host) "!"
 * followed by the tail.  The first hit ends the search.
 *
 * The generated rule table is lower-case, so ASCII letters in host are
 * lower-cased before the lookup; without this "Example.COM" would never
 * match.  Non-ASCII bytes are left untouched, which avoids locale-dependent
 * case mapping.
 */
static gboolean
host_has_public_suffix (const char *host)
{
	GHashTable *table;
	gchar *lowered;
	gchar **labels;
	int i;
	int n;
	gboolean matched = FALSE;

	if (!host || !strchr (host, '.'))
		return FALSE;

	lowered = g_ascii_strdown (host, -1);
	labels = g_strsplit (lowered, ".", -1);
	g_free (lowered);

	/* count the labels and reject hosts with an empty one */
	for (n = 0; labels[n]; n++)
	{
		if (labels[n][0] == '\0')
		{
			g_strfreev (labels);
			return FALSE;
		}
	}

	table = public_suffix_table ();
	for (i = 0; i < n; i++)
	{
		/* labels[i..n-1] joined back together */
		char *tail = g_strjoinv (".", &labels[i]);

		if (g_hash_table_contains (table, tail))
			matched = TRUE;

		if (i + 1 < n)
		{
			/* wildcard rule covering the first label of this tail */
			char *tail_wild = g_strjoinv (".", &labels[i + 1]);
			char *wild = g_strconcat ("*.", tail_wild, NULL);

			if (g_hash_table_contains (table, wild))
				matched = TRUE;
			g_free (tail_wild);
			g_free (wild);
		}

		if (i > 0)
		{
			/* exception rules are stored with their leading '!' */
			char *exc = g_strconcat ("!", tail, NULL);

			if (g_hash_table_contains (table, exc))
				matched = TRUE;
			g_free (exc);
		}

		g_free (tail);
		if (matched)
			break;
	}

	g_strfreev (labels);
	return matched;
}
/* Miscellaneous description --- */ /* Miscellaneous description --- */
#define DOMAIN_LABEL "[\\pL\\pN](?:[-\\pL\\pN]{0,61}[\\pL\\pN])?" #define DOMAIN_LABEL "[\\pL\\pN](?:[-\\pL\\pN]{0,61}[\\pL\\pN])?"
#define DOMAIN DOMAIN_LABEL "(\\." DOMAIN_LABEL ")*" #define DOMAIN DOMAIN_LABEL "(\\." DOMAIN_LABEL ")*"
@@ -477,6 +599,28 @@ re_url (void)
return url_ret; return url_ret;
} }
/*
 * Return the regex used for URLs written without a scheme, compiling it
 * with make_re() on first use and caching the result.
 *
 * The pattern is a single group holding HOST_URL_OPT_TLD, OPT_PORT and an
 * optional '/'-prefixed PATH.
 */
static const GRegex *
re_url_no_scheme (void)
{
	static GRegex *cached = NULL;
	GString *pattern;
	char *pattern_str;

	if (cached == NULL)
	{
		pattern = g_string_new ("(");
		g_string_append (pattern, HOST_URL_OPT_TLD OPT_PORT);
		/* NOTE(review): PATH is expanded inside a printf-style format
		 * here, so any '%' in it is interpreted as a conversion - confirm
		 * against the PATH definition. */
		g_string_append_printf (pattern, "(/" PATH ")?");
		g_string_append (pattern, ")");
		pattern_str = g_string_free (pattern, FALSE);

		cached = make_re (pattern_str);
		g_free (pattern_str);
	}

	return cached;
}
#define EMAIL_LOCAL_ATOM "[\\pL\\pN!#$%&'*+/=?^_`{|}~-]+" #define EMAIL_LOCAL_ATOM "[\\pL\\pN!#$%&'*+/=?^_`{|}~-]+"
#define EMAIL_LOCAL EMAIL_LOCAL_ATOM "(\\." EMAIL_LOCAL_ATOM ")*" #define EMAIL_LOCAL EMAIL_LOCAL_ATOM "(\\." EMAIL_LOCAL_ATOM ")*"
#define EMAIL EMAIL_LOCAL "@" DOMAIN TLD #define EMAIL EMAIL_LOCAL "@" DOMAIN TLD