#!/usr/bin/python3
# Origin: commit 12304fe245f25cff9886eccbbfe7380ceb39bd50 by iBug,
# Mon, 27 Jul 2020 02:35:59 +0800 -- "Add GFWList parser script".
# The same commit adds `gfwlist.txt` to the "# Output" section of .gitignore.
"""Download GFWList and convert it into JavaScript variables for a PAC file.

The list is fetched from ``GFWLIST_URL``, base64-decoded, and split into:

* a domain trie (``||domain`` rules), keyed by reversed host labels, and
* two lists of wildcard patterns (blacklisted and whitelisted).

Running the module as a script prints the resulting JavaScript snippet.
"""

import base64
import json
import urllib.parse


GFWLIST_URL = 'https://raw.githubusercontent.com/gfwlist/gfwlist/master/gfwlist.txt'

# Seconds to wait for the server before giving up; without a timeout
# ``requests`` waits indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Rule modes, stored verbatim under the "@" key of the domain trie.
BLACKLIST = 0
WHITELIST = 1


def get_gfwlist() -> str:
    """Fetch GFWList and return its decoded text.

    Returns:
        The base64-decoded list as UTF-8 text, without trailing newlines.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    # Imported lazily so the pure parsing functions below stay importable
    # without the third-party dependency installed.
    import requests

    r = requests.get(GFWLIST_URL, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    return base64.b64decode(r.text).decode("utf-8").rstrip("\n")


def update_domains(domains: dict, host: str, mode: int = 0) -> None:
    """Record *host* in the *domains* trie with the given *mode*.

    The trie is keyed by host labels from the top-level domain downwards
    (``"a.example.com"`` becomes ``domains["com"]["example"]["a"]``); the
    mode is stored under the ``"@"`` key of the final node.

    Args:
        domains: Trie to update in place.
        host: Host name; leading and trailing dots are ignored.
        mode: ``BLACKLIST`` (0, the default) or ``WHITELIST`` (1).
    """
    host = host.strip(".")
    if not host:
        return  # nothing to record; avoids inserting an empty "" label

    node = domains
    for segment in reversed(host.split(".")):
        node = node.setdefault(segment, {})
    node["@"] = mode


def parse_gfwlist(text: str) -> tuple:
    """Parse decoded GFWList text.

    Args:
        text: The decoded list, one rule per line.

    Returns:
        A ``(domains, blackpat, whitepat)`` tuple: the domain trie built by
        ``update_domains`` and the lists of blacklisted and whitelisted
        wildcard patterns.
    """
    domains = {}
    blackpat = []  # blacklisted patterns
    whitepat = []  # whitelisted patterns

    for raw_line in text.splitlines():
        # Work on the stripped line so stray whitespace can neither hide a
        # comment marker nor end up inside a domain label or pattern.
        line = raw_line.strip()
        if not line or line.startswith("!"):
            continue  # ignore comments and empty lines
        if line.startswith("[") and line.endswith("]"):
            continue  # list header such as "[AutoProxy 0.2.9]", not a rule

        mode = BLACKLIST
        if line.startswith("@@"):
            mode = WHITELIST
            line = line[2:]

        if line.startswith("||"):
            # Domain rule
            update_domains(domains, line[2:], mode)
            continue

        # Keyword pattern.
        # NOTE(review): regex rules (/.../) are not special-cased and are
        # treated as keyword patterns here -- confirm the PAC side expects it.
        # A single vertical bar at either side anchors the pattern to the
        # string boundary; an unanchored side gets a wildcard instead.
        if line.startswith("|"):
            line = line[1:]
        else:
            line = "*" + line
        if line.endswith("|"):
            line = line[:-1]
        else:
            line = line + "*"

        if mode == BLACKLIST:
            blackpat.append(line)
        else:
            whitepat.append(line)
    return domains, blackpat, whitepat


def generate_pac_partial() -> str:
    """Download GFWList and render it as JavaScript variable definitions.

    Returns:
        JavaScript source defining ``DOMAINS``, ``BLACKPAT`` and
        ``WHITEPAT`` as JSON literals.
    """
    gfwlist = get_gfwlist()
    domains, blackpat, whitepat = parse_gfwlist(gfwlist)
    return "var DOMAINS = {};\n\nvar BLACKPAT = {};\n\nvar WHITEPAT = {};\n".format(
        json.dumps(domains, indent=2),
        json.dumps(blackpat, indent=2),
        json.dumps(whitepat, indent=2),
    )


if __name__ == '__main__':
    print(generate_pac_partial())