From 12304fe245f25cff9886eccbbfe7380ceb39bd50 Mon Sep 17 00:00:00 2001 From: iBug Date: Mon, 27 Jul 2020 02:35:59 +0800 Subject: [PATCH 1/6] Add GFWList parser script --- .gitignore | 1 + gfwlist.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100755 gfwlist.py diff --git a/.gitignore b/.gitignore index 85d347b..941f743 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Output +gfwlist.txt release-info.md *.pac dist/ diff --git a/gfwlist.py b/gfwlist.py new file mode 100755 index 0000000..e1743b3 --- /dev/null +++ b/gfwlist.py @@ -0,0 +1,75 @@ +#!/usr/bin/python3 + +import base64 +import json +import urllib.parse +import requests + + +GFWLIST_URL = 'https://raw.githubusercontent.com/gfwlist/gfwlist/master/gfwlist.txt' + + +def get_gfwlist(): + r = requests.get(GFWLIST_URL) + r.raise_for_status() + return base64.b64decode(r.text).decode("utf-8").rstrip("\n") + + +def update_domains(domains, host, mode=0): + segments = host.strip(".").split(".")[::-1] + + this = domains + for segment in segments: + if segment not in this: + this[segment] = {} + this = this[segment] + this["@"] = mode + + +def parse_gfwlist(text): + domains = {} + blackpat = [] # blacklisted patterns + whitepat = [] # whitelisted patterns + + for line in text.splitlines(): + if not line.strip() or line.startswith("!"): + continue # ignore comments and empty lines + + mode = 0 # default to blacklist + if line.startswith("@@"): + mode = 1 # now it's whitelist + line = line[2:] + + if line.startswith("||"): + # domain prefix + update_domains(domains, line[2:], mode) + else: + # Keyword pattern + # Single vertical line at either side means string boundary + if line.startswith("|"): + line = line[1:] + else: + line = "*" + line + if line.endswith("|"): + line = line[:-1] + else: + line = line + "*" + if mode == 0: + blackpat.append(line) + else: + whitepat.append(line) + return domains, blackpat, whitepat + + +def generate_pac_partial(): + gfwlist = get_gfwlist() + domains, blackpat, whitepat = parse_gfwlist(gfwlist) + return "var DOMAINS = {};\n\nvar BLACKPAT = {};\n\nvar WHITEPAT = {};\n".format( + json.dumps(domains, indent=2), + json.dumps(blackpat, indent=2), + json.dumps(whitepat, indent=2), + ) + + +if __name__ == '__main__': + print(generate_pac_partial()) From cf08e0d8407b12b05568c8e3378cc832d9976e81 Mon Sep 17 00:00:00 2001 From: iBug Date: Mon, 27 Jul 2020 03:00:37 +0800 Subject: [PATCH 2/6] Add domain checking and pattern matching --- build.py | 17 +++++++++++++++++ code.js | 52 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/build.py b/build.py index 5092b82..82f3fc8 100755 --- a/build.py +++ b/build.py @@ -6,6 +6,8 @@ import ipaddress import requests from requests.exceptions import RequestException, HTTPError +import gfwlist + SOURCES = { 'ipdeny.com': 'http://www.ipdeny.com/ipblocks/data/aggregated/cn-aggregated.zone', @@ -13,6 +15,9 @@ SOURCES = { } OUT_DIR = "dist" +# Stub content to disable GFWList check +GFWLIST_STUB = "var DOMAINS = {};\nvar BLACKPAT = [];\nvar WHITEPAT = [];\n" + def fetch_and_convert(src): response = requests.get(src) @@ -36,6 +41,9 @@ def main(): code = f.read() code = code.replace("@@TIME@@", now.isoformat()[:-7]) + gfwlist_part = gfwlist.generate_pac_partial() + gfwlist_stub = GFWLIST_STUB + os.makedirs(OUT_DIR, mode=0o755, exist_ok=True) for key in SOURCES: print(f"Generating PAC script from source {key}") @@ -45,10 +53,19 @@ def main(): continue except HTTPError: continue + filename = f"pac-{key}.txt" + filename_gfwlist = f"pac-gfwlist-{key}.txt" with open(os.path.join(OUT_DIR, filename), "w") as f: f.write(code) f.write(data) + f.write("\n") + f.write(gfwlist_stub) + with open(os.path.join(OUT_DIR, filename), "w") as f: + f.write(code) + f.write(data) + f.write("\n") + f.write(gfwlist_part) if __name__ == '__main__': diff --git a/code.js b/code.js index 00bd6a1..36c4a62 100644 --- a/code.js +++ b/code.js @@ -1,6 +1,9 @@ // Author: iBug // Time: @@TIME@@ +var proxy = __PROXY__; +var direct = "DIRECT"; + function belongsToSubnet(host, list) { var ip = host.split(".").map(Number); ip = 0x1000000 * ip[0] + 0x10000 * ip[1] + 0x100 * ip[2] + ip[3]; @@ -23,6 +26,38 @@ function belongsToSubnet(host, list) { return (masked ^ list[x][0]) == 0; } +function hasMatchedPattern(text, patterns) { + for (var i = 0; i < patterns.length; i++) { + if (shExpMatch(text, patterns[i]) + return true; + } + return false; +} + +function checkDomainType(host) { + // Check if a domain is blacklisted or whitelisted + var segments = host.split(".").reverse(); + var ptr = DOMAINS; + var type = DOMAINS["@"]; + for (var i = 0; i < segments.length; i++) { + var segment = segments[i]; + ptr = ptr[segment]; + if (ptr === undefined) + break; + if (ptr["@"] !== undefined) + type = ptr["@"]; + } + return type; +} + +function hasWhitelistedPattern(url) { + return hasMatchedPattern(url, WHITEPAT); +} + +function hasBlacklistedPattern(url) { + return hasMatchedPattern(url, BLACKPAT); +} + function isChina(host) { return belongsToSubnet(host, CHINA); } @@ -31,10 +66,21 @@ function isLan(host) { return belongsToSubnet(host, LAN); } -var proxy = "__PROXY__"; -var direct = "DIRECT"; - function FindProxyForURL(url, host) { + if (hasWhitelistedPattern(url)) { + return direct; + } + if (hasBlacklistedPattern(url)) { + return proxy; + } + var domainType = checkDomainType(host); + if (domainType === 0) { + return proxy; + } else if (domainType === 1) { + return direct; + } + + // Fallback to IP whitelist var remote = dnsResolve(host); if (!remote || remote.indexOf(":") !== -1) { // resolution failed or is IPv6 addr From 97f3e306b9370f4057d07fee87e10b28fdedbaa4 Mon Sep 17 00:00:00 2001 From: iBug Date: Mon, 27 Jul 2020 03:43:36 +0800 Subject: [PATCH 3/6] Stupid mistakes --- build.py | 2 +- code.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.py b/build.py index 82f3fc8..802d7a3 100755 --- a/build.py +++ b/build.py @@ -61,7 +61,7 @@ def main(): f.write(data) f.write("\n") f.write(gfwlist_stub) - with open(os.path.join(OUT_DIR, filename), "w") as f: + with open(os.path.join(OUT_DIR, filename_gfwlist), "w") as f: f.write(code) f.write(data) f.write("\n") diff --git a/code.js b/code.js index 36c4a62..3ca92f6 100644 --- a/code.js +++ b/code.js @@ -28,7 +28,7 @@ function belongsToSubnet(host, list) { function hasMatchedPattern(text, patterns) { for (var i = 0; i < patterns.length; i++) { - if (shExpMatch(text, patterns[i]) + if (shExpMatch(text, patterns[i])) return true; } return false; From 6a039ce9c8b124dbac786e0efcf75cfc2c1834d9 Mon Sep 17 00:00:00 2001 From: iBug Date: Mon, 27 Jul 2020 13:29:59 +0800 Subject: [PATCH 4/6] Exclude regex from GFWList --- gfwlist.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gfwlist.py b/gfwlist.py index e1743b3..5c35a15 100755 --- a/gfwlist.py +++ b/gfwlist.py @@ -43,6 +43,9 @@ def parse_gfwlist(text): if line.startswith("||"): # domain prefix update_domains(domains, line[2:], mode) + elif line.startswith("/"): + # regex, can't handle yet + pass else: # Keyword pattern # Single vertical line at either side means string boundary From c09c33bcf18bc765e5f18e0c2b217a72b8a15561 Mon Sep 17 00:00:00 2001 From: iBug Date: Mon, 27 Jul 2020 13:51:16 +0800 Subject: [PATCH 5/6] Slightly compact domain list --- code.js | 2 ++ gfwlist.py | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/code.js b/code.js index 3ca92f6..817b525 100644 --- a/code.js +++ b/code.js @@ -44,6 +44,8 @@ function checkDomainType(host) { ptr = ptr[segment]; if (ptr === undefined) break; + if (typeof ptr === "number") + return ptr; if (ptr["@"] !== undefined) type = ptr["@"]; } diff --git a/gfwlist.py b/gfwlist.py index 5c35a15..64097b9 100755 --- a/gfwlist.py +++ b/gfwlist.py @@ -1,18 +1,25 @@ #!/usr/bin/python3 +import os import base64 import json import urllib.parse import requests +GFWLIST_FILE = "gfwlist.txt" GFWLIST_URL = 'https://raw.githubusercontent.com/gfwlist/gfwlist/master/gfwlist.txt' def get_gfwlist(): - r = requests.get(GFWLIST_URL) - r.raise_for_status() - return base64.b64decode(r.text).decode("utf-8").rstrip("\n") + if os.path.isfile(GFWLIST_FILE): + with open(GFWLIST_FILE, "r") as f: + text = f.read() + else: + r = requests.get(GFWLIST_URL) + r.raise_for_status() + text = r.text + return base64.b64decode(text).decode("utf-8").rstrip("\n") def update_domains(domains, host, mode=0): @@ -26,6 +33,19 @@ def update_domains(domains, host, mode=0): this["@"] = mode +def postproc_domains(domains): + # Turn all {"@": 1} into 1 to save some text + keys = list(domains.keys()) + for key in keys: + if key == "@": + continue + obj = domains[key] + if len(obj) == 1 and "@" in obj: + domains[key] = obj["@"] + else: + postproc_domains(obj) + + def parse_gfwlist(text): domains = {} blackpat = [] # blacklisted patterns @@ -61,6 +81,7 @@ def parse_gfwlist(text): blackpat.append(line) else: whitepat.append(line) + postproc_domains(domains) return domains, blackpat, whitepat From 7cbc3f6b062a77d33096e415845f7ad8065c793a Mon Sep 17 00:00:00 2001 From: iBug Date: Mon, 27 Jul 2020 14:02:06 +0800 Subject: [PATCH 6/6] Critical: Exclude the first line from GFWList --- gfwlist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gfwlist.py b/gfwlist.py index 64097b9..5292b7f 100755 --- a/gfwlist.py +++ b/gfwlist.py @@ -51,7 +51,7 @@ def parse_gfwlist(text): blackpat = [] # blacklisted patterns whitepat = [] # whitelisted patterns - for line in text.splitlines(): + for line in text.splitlines()[1:]: if not line.strip() or line.startswith("!"): continue # ignore comments and empty lines