initial commit.
gfw-pac.py (executable file, 289 lines)
@@ -0,0 +1,289 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import math
import socket
import struct
import pkgutil
import urlparse
import json
import logging
import urllib2
from argparse import ArgumentParser


gfwlist_url = 'https://autoproxy-gfwlist.googlecode.com/svn/trunk/gfwlist.txt'

def parse_args():
    parser = ArgumentParser()
    parser.add_argument('-i', '--input', dest='input',
                        help='path to gfwlist', metavar='GFWLIST')
    parser.add_argument('-f', '--file', dest='output', required=True,
                        help='path to output pac', metavar='PAC')
    parser.add_argument('-p', '--proxy', dest='proxy', required=True,
                        help='the proxy parameter in the pac file, '
                             'for example, "SOCKS5 127.0.0.1:1080;"',
                        metavar='PROXY')
    parser.add_argument('--user-rule', dest='user_rule',
                        help='user rule file, which will be appended to'
                             ' gfwlist')
    parser.add_argument('--direct-rule', dest='direct_rule',
                        help='user rule file of domains that should bypass'
                             ' the proxy and connect directly')
    parser.add_argument('--ip-file', dest='ip_file',
                        help='delegated-apnic-latest from apnic.net')
    return parser.parse_args()

# from https://github.com/Leask/Flora_Pac
def ip2long(ip):
    # dotted-quad string -> unsigned 32-bit integer (network byte order)
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

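# Illustration (not part of the original file): under Python 2,
#   ip2long('192.168.0.1') -> 3232235521 (0xC0A80001)
#   ip2long('1.0.0.0')     -> 16777216   (0x01000000)
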
# from https://github.com/Leask/Flora_Pac
def fetch_ip_data():
    args = parse_args()
    if args.ip_file:
        with open(args.ip_file, 'rb') as f:
            data = f.read()
    else:
        # fetch data from apnic
        print "Fetching data from apnic.net, it might take a few minutes, please wait..."
        url = r'http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest'
        # url = r'http://flora/delegated-apnic-latest'  # debug
        data = urllib2.urlopen(url).read()

    cnregex = re.compile(r'apnic\|cn\|ipv4\|[0-9\.]+\|[0-9]+\|[0-9]+\|a.*',
                         re.IGNORECASE)
    cndata = cnregex.findall(data)

    results = []
    prev_net = ''

    for item in cndata:
        unit_items = item.split('|')
        starting_ip = unit_items[3]
        num_ip = int(unit_items[4])

        # netmask as a 32-bit integer, e.g. num_ip=256 -> 0xffffff00
        imask = 0xffffffff ^ (num_ip - 1)
        # convert to string
        imask = hex(imask)[2:]
        mask = [0] * 4
        mask[0] = imask[0:2]
        mask[1] = imask[2:4]
        # coarsen every block to /16 granularity, so adjacent allocations
        # inside the same /16 collapse into a single entry
        mask[2] = '0'  # imask[4:6]
        mask[3] = '0'  # imask[6:8]

        # convert str to int
        mask = [int(i, 16) for i in mask]
        mask = "%d.%d.%d.%d" % tuple(mask)

        # mask in *nix (prefix-length) format
        mask2 = 32 - int(math.log(num_ip, 2))

        ip = starting_ip.split('.')
        ip[2] = '0'
        ip[3] = '0'
        starting_ip = '.'.join(ip)
        if starting_ip != prev_net:
            results.append((ip2long(starting_ip), ip2long(mask), mask2))
            prev_net = starting_ip

    # loopback and RFC1918 private ranges always go direct
    results.insert(0, (ip2long('127.0.0.1'), ip2long('255.0.0.0'), 0))
    results.insert(1, (ip2long('10.0.0.0'), ip2long('255.0.0.0'), 0))
    results.insert(2, (ip2long('172.16.0.0'), ip2long('255.240.0.0'), 0))
    results.insert(3, (ip2long('192.168.0.0'), ip2long('255.255.0.0'), 0))

    def ip(item):
        return item[0]

    results = sorted(results, key=ip)
    return results

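# Illustration (assumed input, not part of the original file): a delegation
# line such as
#   apnic|CN|ipv4|1.0.1.0|256|20110414|allocated
# gives num_ip=256, so imask = 0xffffffff ^ 255 = 0xffffff00 and
# mask2 = 32 - log2(256) = 24; after zeroing the last two octets the entry
# recorded is (ip2long('1.0.0.0'), ip2long('255.255.0.0'), 24).
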
def decode_gfwlist(content):
    # decode base64 if we have to: the base64 alphabet contains no '.',
    # while a plain-text rule list always does
    try:
        if '.' in content:
            raise Exception()
        return content.decode('base64')
    except:
        return content

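# Illustration (not part of the original file):
#   decode_gfwlist('fHxnb29nbGUuY29t') -> '||google.com'  (no '.', so decoded)
#   decode_gfwlist('||google.com')     -> '||google.com'  (has '.', kept as-is)
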
def get_hostname(something):
    try:
        # quite enough for GFW; also accept https so rules like
        # '|https://example.com' are not mis-parsed
        if not something.startswith(('http:', 'https:')):
            something = 'http://' + something
        r = urlparse.urlparse(something)
        return r.hostname
    except Exception as e:
        logging.error(e)
        return None


def add_domain_to_set(s, something):
    hostname = get_hostname(something)
    if hostname is not None:
        s.add(hostname)

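# Illustration (not part of the original file):
#   get_hostname('google.com/ncr')            -> 'google.com'
#   get_hostname('https://zh.wikipedia.org/') -> 'zh.wikipedia.org'
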
def combine_lists(content, user_rule=None):
    gfwlist = content.splitlines(False)
    if user_rule:
        gfwlist.extend(user_rule.splitlines(False))
    return gfwlist

def parse_gfwlist(gfwlist):
    domains = set()
    for line in gfwlist:
        if line.find('.*') >= 0:
            # regex-style rule, cannot be mapped to a plain domain
            continue
        elif line.find('*') >= 0:
            # neutralize wildcards so urlparse keeps only the host part
            line = line.replace('*', '/')
        if line.startswith('||'):
            line = line.lstrip('||')
        elif line.startswith('|'):
            line = line.lstrip('|')
        elif line.startswith('.'):
            line = line.lstrip('.')
        if line.startswith('!'):
            # comment line
            continue
        elif line.startswith('['):
            # e.g. the [AutoProxy x.y] header
            continue
        elif line.startswith('@'):
            # ignore white list
            continue
        add_domain_to_set(domains, line)
    return domains

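# Illustration (not part of the original file): typical AutoProxy rules map to
#   '||blogspot.com'      -> 'blogspot.com'
#   '|http://x.cdn.com/y' -> 'x.cdn.com'
#   '@@||cn.example.com'  -> skipped (whitelist rule)
#   '/^https?:\/\/.*/'    -> skipped (contains '.*')
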
def reduce_domains(domains):
    # reduce 'www.google.com' to 'google.com'
    # remove invalid domains
    with open('./tld.txt', 'rb') as f:
        tld_content = f.read()
    tlds = set(tld_content.splitlines(False))
    new_domains = set()
    for domain in domains:
        domain_parts = domain.split('.')
        last_root_domain = None
        for i in xrange(0, len(domain_parts)):
            root_domain = '.'.join(domain_parts[len(domain_parts) - i - 1:])
            if i == 0:
                if root_domain not in tlds:
                    # root_domain is not a valid tld
                    break
            last_root_domain = root_domain
            if root_domain in tlds:
                continue
            else:
                break
        if last_root_domain is not None:
            new_domains.add(last_root_domain)

    # drop a reduced domain when one of its parent domains is itself
    # already in the original set
    uni_domains = set()
    for domain in new_domains:
        domain_parts = domain.split('.')
        for i in xrange(0, len(domain_parts) - 1):
            root_domain = '.'.join(domain_parts[len(domain_parts) - i - 1:])
            if root_domain in domains:
                break
        else:
            uni_domains.add(domain)
    return uni_domains

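# Illustration (not part of the original file), assuming 'com' is in tld.txt:
#   reduce_domains({'www.google.com', 'google.com', 'ssl.gstatic.com'})
#     -> {'google.com', 'gstatic.com'}
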
def generate_pac_fast(domains, proxy, direct_domains, cnips):
    # render the pac file
    with open('./pac-template', 'rb') as f:
        proxy_content = f.read()
    domains_dict = {}
    for domain in domains:
        domains_dict[domain] = 1
    proxy_content = proxy_content.replace('__PROXY__', json.dumps(str(proxy)))
    proxy_content = proxy_content.replace(
        '__DOMAINS__',
        json.dumps(domains_dict, indent=2, sort_keys=True)
    )

    direct_domains_dict = {}
    for domain in direct_domains:
        direct_domains_dict[domain] = 1
    proxy_content = proxy_content.replace(
        '__DIRECT_DOMAINS__',
        json.dumps(direct_domains_dict, indent=2, sort_keys=True)
    )

    proxy_content = proxy_content.replace(
        '__CN_IPS__',
        json.dumps(cnips, indent=2, sort_keys=False)
    )

    return proxy_content

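# Illustration (not part of the original file): pac-template is expected to
# carry the literal placeholders __PROXY__, __DOMAINS__, __DIRECT_DOMAINS__
# and __CN_IPS__. A fragment such as
#   var proxy = __PROXY__;
#   var domains = __DOMAINS__;
# becomes, for --proxy 'SOCKS5 127.0.0.1:1080;' and one domain,
#   var proxy = "SOCKS5 127.0.0.1:1080;";
#   var domains = {
#     "google.com": 1
#   };
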
def generate_pac_precise(rules, proxy):
    def grep_rule(rule):
        if rule:
            if rule.startswith('!'):
                return None
            if rule.startswith('['):
                return None
            return rule
        return None
    # render the pac file
    # note: this path expects the abp.js resource from the gfwlist2pac
    # package and is not used by main() below
    proxy_content = pkgutil.get_data('gfwlist2pac', './abp.js')
    rules = filter(grep_rule, rules)
    proxy_content = proxy_content.replace('__PROXY__', json.dumps(str(proxy)))
    proxy_content = proxy_content.replace('__RULES__',
                                          json.dumps(rules, indent=2))
    return proxy_content

def main():
    args = parse_args()
    user_rule = None
    # default to an empty list so generate_pac_fast() can iterate it
    # even when --direct-rule is not given
    direct_rule = []
    if args.input:
        with open(args.input, 'rb') as f:
            content = f.read()
    else:
        print 'Downloading gfwlist from %s' % gfwlist_url
        content = urllib2.urlopen(gfwlist_url, timeout=10).read()
    if args.user_rule:
        userrule_parts = urlparse.urlsplit(args.user_rule)
        if not userrule_parts.scheme or not userrule_parts.netloc:
            # not a URL, treat it as a local file
            with open(args.user_rule, 'rb') as f:
                user_rule = f.read()
        else:
            # it is a URL, try to download it
            print 'Downloading user rules file from %s' % args.user_rule
            user_rule = urllib2.urlopen(args.user_rule, timeout=10).read()

    if args.direct_rule:
        directrule_parts = urlparse.urlsplit(args.direct_rule)
        if not directrule_parts.scheme or not directrule_parts.netloc:
            # not a URL, treat it as a local file
            with open(args.direct_rule, 'rb') as f:
                direct_rule = f.read()
        else:
            # it is a URL, try to download it
            print 'Downloading direct rules file from %s' % args.direct_rule
            direct_rule = urllib2.urlopen(args.direct_rule, timeout=10).read()
        direct_rule = direct_rule.splitlines(False)

    cnips = fetch_ip_data()

    content = decode_gfwlist(content)
    gfwlist = combine_lists(content, user_rule)

    domains = parse_gfwlist(gfwlist)
    domains = reduce_domains(domains)
    pac_content = generate_pac_fast(domains, args.proxy, direct_rule, cnips)

    with open(args.output, 'wb') as f:
        f.write(pac_content)


if __name__ == '__main__':
    main()
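
# Typical invocation (illustration, not part of the original file; the file
# names are hypothetical):
#   python gfw-pac.py -f gfw.pac \
#       -p 'SOCKS5 127.0.0.1:1080; DIRECT' \
#       --user-rule ./user-rules.txt \
#       --ip-file ./delegated-apnic-latest
# Without -i/--input the gfwlist is downloaded from gfwlist_url; without
# --ip-file the delegation data is fetched from ftp.apnic.net.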