diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj index 287c159c..468e4fe1 100644 --- a/src/common/common.vcxproj +++ b/src/common/common.vcxproj @@ -29,6 +29,7 @@ + @@ -109,6 +110,7 @@ Header Files + + Header Files + Header Files diff --git a/src/common/gen-public-suffix.py b/src/common/gen-public-suffix.py new file mode 100644 index 00000000..4ccdd289 --- /dev/null +++ b/src/common/gen-public-suffix.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +import sys +import urllib.request +from pathlib import Path + +URL = "https://raw.githubusercontent.com/publicsuffix/list/main/public_suffix_list.dat" + + +def parse_rules(text: str): + rules = [] + for raw in text.splitlines(): + line = raw.strip() + if not line or line.startswith("//"): + continue + if " " in line or "\t" in line: + line = line.split()[0] + rules.append(line.lower()) + return sorted(set(rules)) + + +def emit_header(path: str, rules): + with open(path, "w", encoding="utf-8", newline="\n") as out: + out.write("#pragma once\n") + out.write("static const char * const public_suffix_rules[] = {\n") + for rule in rules: + escaped = rule.replace("\\", "\\\\").replace('"', '\\"') + out.write(f'\t"{escaped}",\n') + out.write("};\n") + out.write( + "static const unsigned int public_suffix_rules_len = sizeof(public_suffix_rules) / sizeof(public_suffix_rules[0]);\n" + ) + + +def main(): + if len(sys.argv) != 2: + raise SystemExit("usage: gen-public-suffix.py ") + output = Path(sys.argv[1]) + fallback = Path(__file__).with_name("public_suffix_data.h") + try: + with urllib.request.urlopen(URL, timeout=30) as resp: + data = resp.read().decode("utf-8") + rules = parse_rules(data) + emit_header(str(output), rules) + except Exception: + if not fallback.exists(): + raise + output.write_bytes(fallback.read_bytes()) + + +if __name__ == "__main__": + main() diff --git a/src/common/meson.build b/src/common/meson.build index 99eef515..89db336b 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -96,6 +96,14 @@ marshal = [ make_te = find_program('make-te.py') + +python3 = find_program('python3', required: true) + +public_suffix_data = custom_target('public_suffix_data_h', + output: 'public_suffix_data.h', + command: [python3, files('gen-public-suffix.py'), '@OUTPUT@'] +) + textevents = custom_target('textevents', input: 'textevents.in', output: ['textevents.h', 'textenums.h'], @@ -119,7 +127,7 @@ if get_option('plugin') endif zoitechat_common = static_library('zoitechatcommon', - sources: [textevents] + marshal + common_sources, + sources: [textevents, public_suffix_data] + marshal + common_sources, include_directories: config_h_include, dependencies: common_deps + common_sysinfo_deps, c_args: common_cflags, @@ -127,7 +135,7 @@ zoitechat_common = static_library('zoitechatcommon', ) zoitechat_common_dep = declare_dependency( - sources: [textevents] + marshal, + sources: [textevents, public_suffix_data] + marshal, link_with: zoitechat_common, include_directories: common_includes, compile_args: common_cflags, diff --git a/src/common/url.c b/src/common/url.c index f58806a7..e49138dd 100644 --- a/src/common/url.c +++ b/src/common/url.c @@ -20,12 +20,14 @@ #include #include #include +#include #include "zoitechat.h" #include "zoitechatc.h" #include "cfgfiles.h" #include "fe.h" #include "tree.h" #include "url.h" +#include "public_suffix_data.h" #ifdef HAVE_STRINGS_H #include #endif @@ -35,6 +37,7 @@ GTree *url_btree = NULL; static gboolean regex_match (const GRegex *re, const char *word, int *start, int *end); static const GRegex *re_url (void); +static const GRegex *re_url_no_scheme (void); static const GRegex *re_email (void); static const GRegex *re_nick (void); static const GRegex *re_channel (void); @@ -42,6 +45,8 @@ static gboolean match_nick (const char *word, int *start, int *end); static gboolean match_channel (const char *word, int *start, int *end); static gboolean match_url (const char *word, int *start, int *end); static gboolean match_email (const char *word, int *start, int *end); +static gboolean host_has_public_suffix (const char *host); +static gboolean host_has_public_suffix_range (const char *word, int start, int end); static int url_free (char *url, void *data) @@ -266,7 +271,16 @@ match_channel (const char *word, int *start, int *end) static gboolean match_url (const char *word, int *start, int *end) { - return regex_match (re_url (), word, start, end); + if (regex_match (re_url (), word, start, end)) + return TRUE; + + if (!regex_match (re_url_no_scheme (), word, start, end)) + return FALSE; + + if (*start > 0 && word[*start - 1] == '@') + return FALSE; + + return host_has_public_suffix_range (word, *start, *end); } static gboolean @@ -393,6 +407,114 @@ regex_match (const GRegex *re, const char *word, int *start, int *end) return found; } +static gboolean +host_has_public_suffix_range (const char *word, int start, int end) +{ + char *candidate; + const char *host_start; + const char *host_end; + const char *host_colon; + gboolean ok; + int host_len; + char *host; + + candidate = g_strndup (word + start, end - start); + host_start = candidate; + host_end = candidate + strlen (candidate); + if (*host_start == '[') + { + g_free (candidate); + return FALSE; + } + host_colon = strchr (host_start, ':'); + if (host_colon) + host_end = host_colon; + host_colon = strchr (host_start, '/'); + if (host_colon && host_colon < host_end) + host_end = host_colon; + host_len = (int)(host_end - host_start); + if (host_len <= 0) + { + g_free (candidate); + return FALSE; + } + host = g_strndup (host_start, host_len); + ok = host_has_public_suffix (host); + g_free (host); + g_free (candidate); + return ok; +} + +static GHashTable * +public_suffix_table (void) +{ + static GHashTable *table = NULL; + unsigned int i; + + if (table) + return table; + + table = g_hash_table_new (g_str_hash, g_str_equal); + for (i = 0; i < public_suffix_rules_len; i++) + { + g_hash_table_add (table, (gpointer)public_suffix_rules[i]); + } + return table; +} + +static gboolean +host_has_public_suffix (const char *host) +{ + GHashTable *table; + gchar **labels; + int i; + int n; + gboolean matched = FALSE; + + if (!strchr (host, '.')) + return FALSE; + + labels = g_strsplit (host, ".", -1); + for (n = 0; labels[n]; n++) + { + if (labels[n][0] == '\0') + { + g_strfreev (labels); + return FALSE; + } + } + + table = public_suffix_table (); + for (i = 0; i < n; i++) + { + char *tail = g_strjoinv (".", &labels[i]); + if (g_hash_table_contains (table, tail)) + matched = TRUE; + if (i + 1 < n) + { + char *tail_wild = g_strjoinv (".", &labels[i + 1]); + char *wild = g_strconcat ("*.", tail_wild, NULL); + if (g_hash_table_contains (table, wild)) + matched = TRUE; + g_free (tail_wild); + g_free (wild); + } + if (i > 0) + { + char *exc = g_strconcat ("!", tail, NULL); + if (g_hash_table_contains (table, exc)) + matched = TRUE; + g_free (exc); + } + g_free (tail); + if (matched) + break; + } + + g_strfreev (labels); + return matched; +} + /* Miscellaneous description --- */ #define DOMAIN_LABEL "[\\pL\\pN](?:[-\\pL\\pN]{0,61}[\\pL\\pN])?" #define DOMAIN DOMAIN_LABEL "(\\." DOMAIN_LABEL ")*" @@ -477,6 +599,28 @@ re_url (void) return url_ret; } +static const GRegex * +re_url_no_scheme (void) +{ + static GRegex *url_ret = NULL; + GString *grist_gstr; + char *grist; + + if (url_ret) return url_ret; + + grist_gstr = g_string_new (NULL); + g_string_append (grist_gstr, "("); + g_string_append (grist_gstr, HOST_URL_OPT_TLD OPT_PORT); + g_string_append_printf (grist_gstr, "(/" PATH ")?"); + g_string_append (grist_gstr, ")"); + + grist = g_string_free (grist_gstr, FALSE); + url_ret = make_re (grist); + g_free (grist); + + return url_ret; +} + #define EMAIL_LOCAL_ATOM "[\\pL\\pN!#$%&'*+/=?^_`{|}~-]+" #define EMAIL_LOCAL EMAIL_LOCAL_ATOM "(\\." EMAIL_LOCAL_ATOM ")*" #define EMAIL EMAIL_LOCAL "@" DOMAIN TLD