mirror of
https://gitlab.torproject.org/tpo/core/tor.git
synced 2024-11-10 13:13:44 +01:00
Merge remote-tracking branch 'karsten/mmdb-convert'
This commit is contained in:
commit
9d0af78e3c
@ -1,90 +0,0 @@
|
||||
README.geoip -- information on the IP-to-country-code file shipped with tor
|
||||
===========================================================================
|
||||
|
||||
The IP-to-country-code file in src/config/geoip is based on MaxMind's
|
||||
GeoLite Country database with the following modifications:
|
||||
|
||||
- Those "A1" ("Anonymous Proxy") entries lying in between two entries with
|
||||
the same country code are automatically changed to that country code.
|
||||
These changes can be overridden by specifying a different country code
|
||||
in src/config/geoip-manual.
|
||||
|
||||
- Other "A1" entries are replaced with country codes specified in
|
||||
src/config/geoip-manual, or are left as is if there is no corresponding
|
||||
entry in that file. Even non-"A1" entries can be modified by adding a
|
||||
replacement entry to src/config/geoip-manual. Handle with care.
|
||||
|
||||
|
||||
1. Updating the geoip file from a MaxMind database file
|
||||
-------------------------------------------------------
|
||||
|
||||
Download the most recent MaxMind GeoLite Country database:
|
||||
http://geolite.maxmind.com/download/geoip/database/GeoIPCountryCSV.zip
|
||||
|
||||
Run `python deanonymind.py` in the local directory. Review the output to
|
||||
learn about applied automatic/manual changes and watch out for any
|
||||
warnings.
|
||||
|
||||
Possibly edit geoip-manual to make more/fewer/different manual changes and
|
||||
re-run `python deanonymind.py`.
|
||||
|
||||
When done, prepend the new geoip file with a comment like this:
|
||||
|
||||
# Last updated based on $DATE Maxmind GeoLite Country
|
||||
# See README.geoip for details on the conversion.
|
||||
|
||||
|
||||
2. Verifying automatic and manual changes using diff
|
||||
----------------------------------------------------
|
||||
|
||||
To unzip the original MaxMind file and look at the automatic changes, run:
|
||||
|
||||
unzip GeoIPCountryCSV.zip
|
||||
diff -U1 GeoIPCountryWhois.csv AutomaticGeoIPCountryWhois.csv
|
||||
|
||||
To look at subsequent manual changes, run:
|
||||
|
||||
diff -U1 AutomaticGeoIPCountryWhois.csv ManualGeoIPCountryWhois.csv
|
||||
|
||||
To manually generate the geoip file and compare it to the automatically
|
||||
created one, run:
|
||||
|
||||
cut -d, -f3-5 < ManualGeoIPCountryWhois.csv | sed 's/"//g' > mygeoip
|
||||
diff -U1 geoip mygeoip
|
||||
|
||||
|
||||
3. Verifying automatic and manual changes using blockfinder
|
||||
-----------------------------------------------------------
|
||||
|
||||
Blockfinder is a powerful tool to handle multiple IP-to-country data
|
||||
sources. Blockfinder has a function to specify a country code and compare
|
||||
conflicting country code assignments in different data sources.
|
||||
|
||||
We can use blockfinder to compare A1 entries in the original MaxMind file
|
||||
with the same or overlapping blocks in the file generated above and in the
|
||||
RIR delegation files:
|
||||
|
||||
git clone https://github.com/ioerror/blockfinder
|
||||
cd blockfinder/
|
||||
python blockfinder -i
|
||||
python blockfinder -r ../GeoIPCountryWhois.csv
|
||||
python blockfinder -r ../ManualGeoIPCountryWhois.csv
|
||||
python blockfinder -p A1 > A1-comparison.txt
|
||||
|
||||
The output marks conflicts between assignments using either '*' in case of
|
||||
two different opinions or '#' for three or more different opinions about
|
||||
the country code for a given block.
|
||||
|
||||
The '*' conflicts are most likely harmless, because there will always be
|
||||
at least two opinions with the original MaxMind file saying A1 and the
|
||||
other two sources saying something more meaningful.
|
||||
|
||||
However, watch out for '#' conflicts. In these cases, the original
|
||||
MaxMind file ("A1"), the updated MaxMind file (hopefully the correct
|
||||
country code), and the RIR delegation files (some other country code) all
|
||||
disagree.
|
||||
|
||||
There are perfectly valid cases where the updated MaxMind file and the RIR
|
||||
delegation files don't agree. But each of those cases must be verified
|
||||
manually.
|
||||
|
@ -1,205 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import optparse
|
||||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
"""
|
||||
Take a MaxMind GeoLite Country database as input and replace A1 entries
|
||||
with the country code and name of the preceding entry iff the preceding
|
||||
(subsequent) entry ends (starts) directly before (after) the A1 entry and
|
||||
both preceding and subsequent entries contain the same country code.
|
||||
|
||||
Then apply manual changes, either replacing A1 entries that could not be
|
||||
replaced automatically or overriding previously made automatic changes.
|
||||
"""
|
||||
|
||||
def main():
    """Run the whole conversion pipeline.

    Reads the MaxMind CSV, applies automatic A1 replacements, writes the
    intermediate file, applies manual overrides from geoip-manual (if
    present), and finally writes both the full CSV and the short geoip
    file shipped with tor.
    """
    options = parse_options()
    assignments = read_file(options.in_maxmind)
    assignments = apply_automatic_changes(assignments)
    write_file(options.out_automatic, assignments)
    # Manual overrides are optional; read_file returns None when missing.
    manual_assignments = read_file(options.in_manual, must_exist=False)
    assignments = apply_manual_changes(assignments, manual_assignments)
    write_file(options.out_manual, assignments)
    # Short format: only start_num,end_num,country_code per line.
    write_file(options.out_geoip, assignments, long_format=False)
|
||||
def parse_options():
    """Define and parse the command-line options.

    Returns an optparse options object with the five file-path settings:
    in_maxmind, in_manual, out_automatic, out_manual, out_geoip.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', action='store', dest='in_maxmind',
            default='GeoIPCountryCSV.zip', metavar='FILE',
            help='use the specified MaxMind GeoLite Country .zip or .csv '
                 'file as input [default: %default]')
    parser.add_option('-g', action='store', dest='in_manual',
            default='geoip-manual', metavar='FILE',
            help='use the specified .csv file for manual changes or to '
                 'override automatic changes [default: %default]')
    parser.add_option('-a', action='store', dest='out_automatic',
            default="AutomaticGeoIPCountryWhois.csv", metavar='FILE',
            help='write full input file plus automatic changes to the '
                 'specified .csv file [default: %default]')
    parser.add_option('-m', action='store', dest='out_manual',
            default='ManualGeoIPCountryWhois.csv', metavar='FILE',
            help='write full input file plus automatic and manual '
                 'changes to the specified .csv file [default: %default]')
    parser.add_option('-o', action='store', dest='out_geoip',
            default='geoip', metavar='FILE',
            help='write full input file plus automatic and manual '
                 'changes to the specified .csv file that can be shipped '
                 'with tor [default: %default]')
    (options, args) = parser.parse_args()
    return options
|
||||
def read_file(path, must_exist=True):
    """Read a GeoIP CSV file and return its content lines as a list.

    Accepts either a plain .csv file or a .zip archive containing
    GeoIPCountryWhois.csv.  Blank lines and '#' comment lines are
    skipped.  If the file is missing: exit the process when must_exist
    is True, otherwise return None.
    """
    if not os.path.exists(path):
        if must_exist:
            print 'File %s does not exist. Exiting.' % (path, )
            sys.exit(1)
        else:
            return
    if path.endswith('.zip'):
        # MaxMind ships the CSV inside a zip; read it from the archive.
        zip_file = zipfile.ZipFile(path)
        csv_content = zip_file.read('GeoIPCountryWhois.csv')
        zip_file.close()
    else:
        csv_file = open(path)
        csv_content = csv_file.read()
        csv_file.close()
    assignments = []
    for line in csv_content.split('\n'):
        stripped_line = line.strip()
        # Keep only non-empty, non-comment lines.
        if len(stripped_line) > 0 and not stripped_line.startswith('#'):
            assignments.append(stripped_line)
    return assignments
|
||||
def apply_automatic_changes(assignments):
    """Replace eligible "A1" (Anonymous Proxy) lines automatically.

    Collects consecutive "A1" lines and hands each run, together with
    the surrounding non-A1 lines, to process_a1_lines(), which decides
    whether the run can inherit its neighbors' country code.  Returns
    the new list of lines.
    """
    print '\nApplying automatic changes...'
    result_lines = []
    prev_line = None
    a1_lines = []  # current run of consecutive "A1" lines
    for line in assignments:
        if '"A1"' in line:
            a1_lines.append(line)
        else:
            if len(a1_lines) > 0:
                # A non-A1 line terminates the run; decide its fate now.
                new_a1_lines = process_a1_lines(prev_line, a1_lines, line)
                for new_a1_line in new_a1_lines:
                    result_lines.append(new_a1_line)
                a1_lines = []
            result_lines.append(line)
            prev_line = line
    # Flush a trailing A1 run at end of input (no next line exists).
    if len(a1_lines) > 0:
        new_a1_lines = process_a1_lines(prev_line, a1_lines, None)
        for new_a1_line in new_a1_lines:
            result_lines.append(new_a1_line)
    return result_lines
|
||||
def process_a1_lines(prev_line, a1_lines, next_line):
    """Decide whether a run of A1 lines can inherit a country code.

    Only a single A1 line whose range directly touches both the
    preceding and following entries, with both neighbors sharing the
    same country code, is rewritten.  Anything else is returned
    unchanged.
    """
    if not prev_line or not next_line:
        return a1_lines # Can't merge first or last line in file.
    if len(a1_lines) > 1:
        return a1_lines # Can't merge more than 1 line at once.
    a1_line = a1_lines[0].strip()
    prev_entry = parse_line(prev_line)
    a1_entry = parse_line(a1_line)
    next_entry = parse_line(next_line)
    # "Touches" means there is no numeric gap between the ranges.
    touches_prev_entry = int(prev_entry['end_num']) + 1 == \
            int(a1_entry['start_num'])
    touches_next_entry = int(a1_entry['end_num']) + 1 == \
            int(next_entry['start_num'])
    same_country_code = prev_entry['country_code'] == \
            next_entry['country_code']
    if touches_prev_entry and touches_next_entry and same_country_code:
        new_line = format_line_with_other_country(a1_entry, prev_entry)
        # Log the change in unified-diff style for review.
        print '-%s\n+%s' % (a1_line, new_line, )
        return [new_line]
    else:
        return a1_lines
|
||||
def parse_line(line):
    """Split one GeoIP CSV line into a dict of its six fields.

    Fields are start_str, end_str, start_num, end_num, country_code,
    country_name (all kept as strings).  Returns None for a falsy line.
    """
    if not line:
        return None
    field_names = ('start_str', 'end_str', 'start_num', 'end_num',
                   'country_code', 'country_name')
    cleaned = line.replace('"', '').strip()
    return {name: value
            for name, value in zip(field_names, cleaned.split(','))}
|
||||
def format_line_with_other_country(original_entry, other_entry):
    """Rebuild a CSV line using the address range from original_entry
    and the country code/name from other_entry."""
    fields = (original_entry['start_str'],
              original_entry['end_str'],
              original_entry['start_num'],
              original_entry['end_num'],
              other_entry['country_code'],
              other_entry['country_name'])
    return ','.join('"%s"' % field for field in fields)
|
||||
def apply_manual_changes(assignments, manual_assignments):
    """Apply the overrides from geoip-manual to the assignment lines.

    Each manual line is keyed by its start_num.  A manual entry whose
    range exactly matches an input line either removes it (country code
    not 2 chars long), replaces its country, or — if identical — is
    reported as ineffective.  Partial range matches and unapplied
    entries are reported as warnings.  Returns the new list of lines.
    """
    if not manual_assignments:
        return assignments
    print '\nApplying manual changes...'
    # Index manual lines by start_num; later duplicates win.
    manual_dict = {}
    for line in manual_assignments:
        start_num = parse_line(line)['start_num']
        if start_num in manual_dict:
            print ('Warning: duplicate start number in manual '
                    'assignments:\n %s\n %s\nDiscarding first entry.' %
                    (manual_dict[start_num], line, ))
        manual_dict[start_num] = line
    result = []
    for line in assignments:
        entry = parse_line(line)
        start_num = entry['start_num']
        if start_num in manual_dict:
            manual_line = manual_dict[start_num]
            manual_entry = parse_line(manual_line)
            # Only apply a manual change when the whole range matches.
            if entry['start_str'] == manual_entry['start_str'] and \
                    entry['end_str'] == manual_entry['end_str'] and \
                    entry['end_num'] == manual_entry['end_num']:
                # A non-2-letter "country code" means: drop this entry.
                if len(manual_entry['country_code']) != 2:
                    print '-%s' % (line, ) # only remove, don't replace
                    del manual_dict[start_num]
                elif entry['country_code'] != \
                        manual_entry['country_code']:
                    new_line = format_line_with_other_country(entry,
                            manual_entry)
                    print '-%s\n+%s' % (line, new_line, )
                    result.append(new_line)
                    del manual_dict[start_num]
                else:
                    # Manual entry matches the input already; keep line.
                    print ('Warning: not applying ineffective manual '
                            'change:\n %s\n %s' % (line, manual_line, ))
                    result.append(line)
            else:
                print ('Warning: not applying manual change that is only '
                        'a partial match:\n %s\n %s' %
                        (line, manual_line, ))
                result.append(line)
        elif 'country_code' in entry and \
                entry['country_code'] == 'A1':
            # A1 entries that survived the automatic pass and have no
            # manual override are worth flagging for a human.
            print ('Warning: no manual replacement for A1 entry:\n %s'
                    % (line, ))
            result.append(line)
        else:
            result.append(line)
    # Anything left in manual_dict never matched an input line.
    if len(manual_dict) > 0:
        print 'Warning: could not apply all manual assignments:'
        for line in manual_dict.values():
            print ' %s' % (line, )
    return result
|
||||
def write_file(path, assignments, long_format=True):
    """Write assignment lines to path, joined by newlines.

    With long_format=True the lines are written verbatim; otherwise each
    line is reduced to the short "start_num,end_num,country_code" form
    that tor ships.
    """
    if long_format:
        lines = assignments
    else:
        lines = []
        for row in assignments:
            entry = parse_line(row)
            lines.append(','.join((entry['start_num'],
                                   entry['end_num'],
                                   entry['country_code'])))
    with open(path, 'w') as out_file:
        out_file.write('\n'.join(lines))
|
||||
# Run the conversion when executed as a script; importing the module
# has no side effects.
if __name__ == '__main__':
    main()
||||
|
@ -1,80 +0,0 @@
|
||||
# This file contains manual overrides of A1 entries (and possibly others)
|
||||
# in MaxMind's GeoLite Country database. Use deanonymind.py in the same
|
||||
# directory to process this file when producing a new geoip file. See
|
||||
# README.geoip in the same directory for details.
|
||||
|
||||
# GB, because RIR delegation files say exactly this range
|
||||
# 46.16.32.0-46.16.39.255 is GB, even though neither previous nor next
|
||||
# MaxMind range is GB. Both previous and next MaxMind ranges match RIR
|
||||
# delegation files, too. -KL 2013-03-07
|
||||
"46.16.32.0","46.16.39.255","772808704","772810751","GB","United Kingdom"
|
||||
|
||||
# CH, because previous MaxMind entry 46.19.141.0-46.19.142.255 is CH, and
|
||||
# RIR delegation files say 46.19.136.0-46.19.143.255 is CH.
|
||||
# -KL 2012-11-27
|
||||
"46.19.143.0","46.19.143.255","773033728","773033983","CH","Switzerland"
|
||||
|
||||
# GB, because next MaxMind entry 46.166.129.0-46.166.134.255 is GB, and
|
||||
# RIR delegation files say 46.166.128.0-46.166.191.255 is GB.
|
||||
# -KL 2012-11-27
|
||||
"46.166.128.0","46.166.128.255","782663680","782663935","GB","United Kingdom"
|
||||
|
||||
# US, because previous MaxMind entry 70.159.21.51-70.232.244.255 is US,
|
||||
# because next MaxMind entry 70.232.245.58-70.232.245.59 is A2 ("Satellite
|
||||
# Provider") which is country information about as useless as A1, and
|
||||
# because RIR delegation files say 70.224.0.0-70.239.255.255 is US.
|
||||
# -KL 2012-11-27
|
||||
"70.232.245.0","70.232.245.57","1189672192","1189672249","US","United States"
|
||||
|
||||
# US, because next MaxMind entry 70.232.246.0-70.240.141.255 is US,
|
||||
# because previous MaxMind entry 70.232.245.58-70.232.245.59 is A2
|
||||
# ("Satellite Provider") which is country information about as useless
|
||||
# as A1, and because RIR delegation files say 70.224.0.0-70.239.255.255 is
|
||||
# US. -KL 2012-11-27
|
||||
"70.232.245.60","70.232.245.255","1189672252","1189672447","US","United States"
|
||||
|
||||
# GB, despite neither previous (GE) nor next (LV) MaxMind entry being GB,
|
||||
# but because RIR delegation files agree with both previous and next
|
||||
# MaxMind entry and say GB for 91.228.0.0-91.228.3.255. -KL 2012-11-27
|
||||
"91.228.0.0","91.228.3.255","1541668864","1541669887","GB","United Kingdom"
|
||||
|
||||
# NL, because next MaxMind entry 176.56.173.0-176.56.173.63 is NL, and RIR
|
||||
# delegation files say 176.56.160.0-176.56.191.255 is NL. -KL 2013-05-13
|
||||
"176.56.172.0","176.56.172.255","2956504064","2956504319","NL","Netherlands"
|
||||
|
||||
# NL, despite neither previous (RU) nor next (GB) MaxMind entry being NL,
|
||||
# but because RIR delegation files say entire range
|
||||
# 176.56.160.0-176.56.191.255 is NL. -KL 2013-05-13
|
||||
"176.56.174.0","176.56.174.255","2956504576","2956504831","NL","Netherlands"
|
||||
|
||||
# GB, because RIR delegation files say exactly this range
|
||||
# 185.25.84.0-185.25.87.255 is GB, even though neither previous nor next
|
||||
# MaxMind range is GB. Both previous and next MaxMind ranges match RIR
|
||||
# delegation files, too. -KL 2013-05-13
|
||||
"185.25.84.0","185.25.87.255","3105444864","3105445887","GB","United Kingdom"
|
||||
|
||||
# US, because next MaxMind entry 199.101.193.0-199.101.195.255 is US, and,
|
||||
# together with next entries, matches RIR delegation file entry
|
||||
# 199.101.192.0-199.101.199.255 which is US. -KL 2013-05-13
|
||||
"199.101.192.0","199.101.192.255","3345334272","3345334527","US","United States"
|
||||
|
||||
# US, because ARIN says 199.255.208.0-199.255.215.255 is US.
|
||||
# Changed entry start from 199.255.213.0 to 199.255.208.0 on 2013-08-12.
|
||||
# Split up into 199.255.208.0-199.255.209.127 and
|
||||
# 199.255.210.0-199.255.215.255 on 2013-10-11. -KL 2013-10-11
|
||||
"199.255.208.0","199.255.209.127","3355430912","3355431295","US","United States"
|
||||
"199.255.210.0","199.255.215.255","3355431424","3355432959","US","United States"
|
||||
|
||||
# EU, despite neither previous (RU) nor next (SE) MaxMind entry being EU,
|
||||
# but because RIR delegation files agree with previous MaxMind entry and
|
||||
# say EU for 217.15.160.0-217.15.175.255. -KL 2013-05-13
|
||||
"217.15.160.0","217.15.164.255","3641679872","3641681151","EU","Europe"
|
||||
|
||||
# FR, because previous MaxMind entry 217.15.166.0-217.15.166.255 is FR,
|
||||
# and RIR delegation files contain a block 217.15.160.0-217.15.175.255
|
||||
# which, however, is EU, not FR. But merging with next MaxMind entry
|
||||
# 217.15.176.0-217.15.191.255 which is KZ and which fully matches what
|
||||
# the RIR delegation files say seems unlikely to be correct.
|
||||
# -KL 2012-11-27
|
||||
"217.15.167.0","217.15.175.255","3641681664","3641683967","FR","France"
|
||||
|
446
src/config/mmdb-convert.py
Normal file
446
src/config/mmdb-convert.py
Normal file
@ -0,0 +1,446 @@
|
||||
#!/usr/bin/python3

# This software has been dedicated to the public domain under the CC0
# public domain dedication.
#
# To the extent possible under law, the person who associated CC0
# with mmdb-convert.py has waived all copyright and related or
# neighboring rights to mmdb-convert.py.
#
# You should have received a copy of the CC0 legalcode along with this
# work in doc/cc0.txt.  If not, see
# <http://creativecommons.org/publicdomain/zero/1.0/>.

# Nick Mathewson is responsible for this kludge, but takes no
# responsibility for it.

"""This kludge is meant to
   parse mmdb files in sufficient detail to dump out the old format
   that Tor expects.  It's also meant to be pure-python.

   When given a simplicity/speed tradeoff, it opts for simplicity.

   You will not understand the code without understanding the MaxMind-DB
   file format.  It is specified at:
   https://github.com/maxmind/MaxMind-DB/blob/master/MaxMind-DB-spec.md.

   This isn't so much tested.  When it breaks, you get to keep both
   pieces.
"""

import struct
import bisect
import socket
import binascii
import sys
import time

# The metadata map follows the last occurrence of this marker in the
# file (see parse_mm_file).
METADATA_MARKER = b'\xab\xcd\xefMaxMind.com'
|
||||
|
||||
# Here's some python2/python3 junk. Better solutions wanted.
|
||||
# Python 2/3 compatibility shim: obtain a function mapping a single
# element of a bytestring to an int.  Better solutions wanted.
try:
    ord(b"1"[0])
except TypeError:
    # Python 3: indexing a bytes object already yields an int.
    def byte_to_int(b):
        "convert a single element of a bytestring to an integer."
        return b
else:
    # Python 2: bytestring elements are length-1 strings; use ord().
    byte_to_int = ord

# Second compatibility shim: obtain a utf8-bytes-to-str function.
try:
    str(b"a", "utf8")
except TypeError:
    # Python 2: bytes is str, so plain str() is already correct.
    bytesToStr = str
else:
    def bytesToStr(b):
        "convert a bytestring in utf8 to a string."
        return str(b, 'utf8')

def to_int(s):
    "Parse a big-endian integer from bytestring s."
    value = 0
    for element in s:
        value = (value << 8) | byte_to_int(element)
    return value
|
||||
def to_int24(s):
    "Parse a pair of big-endian 24-bit integers from bytestring s."
    hi, mid, lo = struct.unpack("!HHH", s)
    # The two 24-bit values share the middle 16-bit word.
    left = (hi << 8) | (mid >> 8)
    right = ((mid & 0xff) << 16) | lo
    return left, right
|
||||
def to_int32(s):
    "Parse a pair of big-endian 32-bit integers from bytestring s."
    # struct.unpack already returns the 2-tuple we need.
    return struct.unpack("!LL", s)
|
||||
def to_int28(s):
    """Parse a pair of big-endian 28-bit integers from bytestring s.

    s holds 7 bytes: the first value is bytes 0-2 plus the high nibble
    of byte 3; the second is the low nibble of byte 3 plus bytes 4-6.
    """
    # Bug fix: the original called bare unpack(), which is not defined
    # in this module (NameError on any file with 28-bit records); the
    # function lives in the struct module.
    a, b = struct.unpack("!LL", s + b'\x00')
    return (((a & 0xf0) << 20) + (a >> 8)), ((a & 0x0f) << 24) + (b >> 8)
|
||||
class Tree(object):
    """Holds a node in the tree.

    left and right are the raw record values read from the search tree;
    resolve_tree() later attaches left_item/right_item attributes that
    point at the resolved Tree, Datum, or None.
    """
    def __init__(self, left, right):
        self.left = left
        self.right = right
|
||||
def resolve_tree(tree, data):
    """Fill in the left_item and right_item fields for all values in the tree
    so that they point to another Tree, or to a Datum, or to None."""
    # Scratch Datum reused as the bisect search key (compared by .pos).
    d = Datum(None, None, None, None)
    def resolve_item(item):
        "Helper: resolve a single index."
        # A record value below the node count names another tree node;
        # equal to the node count means "no data here"; larger values
        # point into the data section.
        if item < len(tree):
            return tree[item]
        elif item == len(tree):
            return None
        else:
            # Data-section offset, computed as item - node_count - 16.
            d.pos = (item - len(tree) - 16)
            p = bisect.bisect_left(data, d)
            assert data[p].pos == d.pos
            return data[p]

    for t in tree:
        t.left_item = resolve_item(t.left)
        t.right_item = resolve_item(t.right)
||||
|
||||
def parse_search_tree(s, record_size):
    """Given a bytestring and a record size in bits, parse the tree.
    Return a list of nodes."""
    # Each node stores two records back to back.
    record_bytes = (record_size*2) // 8
    nodes = []
    p = 0
    try:
        # Choose the decoder that matches this record size.
        to_leftright = { 24: to_int24,
                         28: to_int28,
                         32: to_int32 }[ record_size ]
    except KeyError:
        raise NotImplementedError("Unsupported record size in bits: %d" %
                                  record_size)
    while p < len(s):
        left, right = to_leftright(s[p:p+record_bytes])
        p += record_bytes

        nodes.append( Tree(left, right ) )

    return nodes
|
||||
class Datum(object):
    """Holds a single entry from the Data section"""
    def __init__(self, pos, kind, ln, data):
        self.pos = pos   # Position of this record within data section
        self.kind = kind # Type of this record. one of TP_*
        self.ln = ln     # Length field, which might be overloaded.
        self.data = data # Raw bytes data.
        self.children = None # Used for arrays and maps.

    def __repr__(self):
        return "Datum(%r,%r,%r,%r)" % (self.pos, self.kind, self.ln, self.data)

    # Comparison functions used for bsearch (compare by position only).
    def __lt__(self, other):
        return self.pos < other.pos

    def __gt__(self, other):
        return self.pos > other.pos

    def __eq__(self, other):
        return self.pos == other.pos

    def build_maps(self):
        """If this is a map or array, fill in its 'map' field if it's a map,
        and the 'map' field of all its children."""

        # nChildren only exists on containers that have not been
        # processed yet; plain values return immediately.
        if not hasattr(self, 'nChildren'):
            return

        if self.kind == TP_ARRAY:
            del self.nChildren
            for c in self.children:
                c.build_maps()

        elif self.kind == TP_MAP:
            del self.nChildren
            self.map = {}
            # Children alternate key, value, key, value, ...
            for i in range(0, len(self.children), 2):
                k = self.children[i].deref()
                v = self.children[i+1].deref()
                v.build_maps()
                if k.kind != TP_UTF8:
                    raise ValueError("Bad dictionary key type %d"% k.kind)
                self.map[bytesToStr(k.data)] = v

    def int_val(self):
        """If this is an integer type, return its value"""
        assert self.kind in (TP_UINT16, TP_UINT32, TP_UINT64,
                             TP_UINT128, TP_SINT32)
        i = to_int(self.data)
        # Reinterpret as two's-complement for the signed 32-bit type.
        if self.kind == TP_SINT32:
            if i & 0x80000000:
                i = i - 0x100000000
        return i

    def deref(self):
        """If this value is a pointer, return its pointed-to-value.  Chase
        through multiple layers of pointers if need be.  If this isn't
        a pointer, return it."""
        n = 0
        s = self
        while s.kind == TP_PTR:
            s = s.ptr
            n += 1
            # Guard against pointer cycles in a malformed file.
            assert n < 100
        return s
|
||||
def resolve_pointers(data):
    """Fill in the ptr field of every pointer in data."""
    # Scratch Datum reused as the bisect search key (compared by .pos).
    search = Datum(None, None, None, None)
    for d in data:
        if d.kind == TP_PTR:
            # For pointers, the overloaded ln field holds the target's
            # position in the data section (see get_type_and_len).
            search.pos = d.ln
            p = bisect.bisect_left(data, search)
            assert data[p].pos == d.ln
            d.ptr = data[p]
|
||||
# Data-section value type codes from the MaxMind-DB specification.
TP_PTR = 1
TP_UTF8 = 2
TP_DBL = 3
TP_BYTES = 4
TP_UINT16 = 5
TP_UINT32 = 6
TP_MAP = 7
TP_SINT32 = 8
TP_UINT64 = 9
TP_UINT128 = 10
TP_ARRAY = 11
TP_DCACHE = 12
TP_END = 13
TP_BOOL = 14
TP_FLOAT = 15
|
||||
def get_type_and_len(s):
    """Data parsing helper: decode the type value and much-overloaded 'length'
    field for the value starting at s.  Return a 3-tuple of type, length,
    and number of bytes used to encode type-plus-length."""
    c = byte_to_int(s[0])
    tp = c >> 5   # top 3 bits: type code (0 means extended type follows)
    skip = 1
    if tp == 0:
        # Extended type: next byte holds the real type code minus 7.
        tp = byte_to_int(s[1])+7
        skip = 2
    ln = c & 31   # low 5 bits: length, or size selector for pointers

    # I'm sure I don't know what they were thinking here...
    if tp == TP_PTR:
        # Pointers encode their own size in the top length bits; the
        # remaining bits plus len_len extra bytes form the target
        # offset, with a size-dependent bias added.
        len_len = (ln >> 3) + 1
        if len_len < 4:
            ln &= 7
            ln <<= len_len * 8
        else:
            ln = 0
        ln += to_int(s[skip:skip+len_len])
        ln += (0, 0, 2048, 526336, 0)[len_len]
        skip += len_len
    elif ln >= 29:
        # Length values 29..31 mean the real length follows in 1..3
        # extra bytes, with a bias added.
        len_len = ln - 28
        ln = to_int(s[skip:skip+len_len])
        ln += (0, 29, 285, 65821)[len_len]
        skip += len_len

    return tp, ln, skip
||||
|
||||
# Set of types for which 'length' doesn't mean length.
IGNORE_LEN_TYPES = set([
    TP_MAP,    # Length is number of key-value pairs that follow.
    TP_ARRAY,  # Length is number of members that follow.
    TP_PTR,    # Length is index to pointed-to data element.
    TP_BOOL,   # Length is 0 or 1.
    TP_DCACHE, # Length is number of members that follow.
])
||||
|
||||
def parse_data_section(s):
    """Given a data section encoded in a bytestring, return a list of
    Datum items."""

    # Stack of possibly nested containers.  We use the 'nChildren' member of
    # the last one to tell how many more items nest directly inside.
    stack = []

    # List of all items, including nested ones.
    data = []

    # Byte index within the data section.
    pos = 0

    while s:
        tp, ln, skip = get_type_and_len(s)
        # For container/pointer/bool types the length field is not a
        # byte count, so no payload bytes follow the header.
        if tp in IGNORE_LEN_TYPES:
            real_len = 0
        else:
            real_len = ln

        d = Datum(pos, tp, ln, s[skip:skip+real_len])
        data.append(d)
        pos += skip+real_len
        s = s[skip+real_len:]

        if stack:
            # This datum is a direct child of the innermost open
            # container; close the container when it is full.
            stack[-1].children.append(d)
            stack[-1].nChildren -= 1
            if stack[-1].nChildren == 0:
                del stack[-1]

        if d.kind == TP_ARRAY:
            d.nChildren = d.ln
            d.children = []
            stack.append(d)
        elif d.kind == TP_MAP:
            # A map's length counts pairs; it holds 2*ln children.
            d.nChildren = d.ln * 2
            d.children = []
            stack.append(d)

    return data
||||
|
||||
def parse_mm_file(s):
    """Parse a MaxMind-DB file.

    Returns a 3-tuple (metadata, tree, data): the parsed metadata
    section, the resolved search-tree node list, and the parsed data
    section.
    """
    try:
        # The metadata section follows the last marker occurrence.
        metadata_ptr = s.rindex(METADATA_MARKER)
    except ValueError:
        raise ValueError("No metadata!")

    metadata = parse_data_section(s[metadata_ptr+len(METADATA_MARKER):])

    if metadata[0].kind != TP_MAP:
        raise ValueError("Bad map")

    metadata[0].build_maps()
    mm = metadata[0].map

    # Tree size in bytes: node_count * (2 records * record_size bits / 8).
    tree_size = (((mm['record_size'].int_val() * 2) // 8 ) *
                 mm['node_count'].int_val())

    # 16 zero bytes separate the search tree from the data section.
    if s[tree_size:tree_size+16] != b'\x00'*16:
        raise ValueError("Missing section separator!")

    tree = parse_search_tree(s[:tree_size], mm['record_size'].int_val())

    data = parse_data_section(s[tree_size+16:metadata_ptr])

    resolve_pointers(data)
    resolve_tree(tree, data)

    for d in data:
        d.build_maps()

    return metadata, tree, data
||||
|
||||
def format_datum(datum):
    """Given a Datum at a leaf of the tree, return the string that we should
    write as its value.
    """
    try:
        # Tor only needs the ISO country code from each leaf record.
        return bytesToStr(datum.map['country'].map['iso_code'].data)
    except KeyError:
        # No country information for this network; the caller skips it.
        pass
    return None
||||
|
||||
# A 128-bit prefix starting with 96 zero bits is an embedded IPv4 address.
IPV4_PREFIX = "0"*96
|
||||
|
||||
def dump_item_ipv4(entries, prefix, val):
    """Dump the information for an IPv4 address to entries, where 'prefix'
    is a string holding a binary prefix for the address, and 'val' is the
    value to dump.  If the prefix is not an IPv4 address (it does not start
    with 96 bits of 0), then print nothing.
    """
    if not prefix.startswith(IPV4_PREFIX):
        return
    # The remaining bits are the network part of the IPv4 address.
    prefix = prefix[96:]
    v = int(prefix, 2)
    shift = 32 - len(prefix)
    lo = v << shift            # first address covered by the prefix
    hi = ((v+1) << shift) - 1  # last address covered by the prefix
    entries.append((lo, hi, val))
||||
|
||||
def fmt_item_ipv4(entry):
    """Format an IPv4 range with lo and hi addresses in decimal form."""
    lo, hi, val = entry
    return "%d,%d,%s\n" % (lo, hi, val)
||||
|
||||
def fmt_ipv6_addr(v):
    """Given a 128-bit integer representing an ipv6 address, return a
    string for that ipv6 address."""
    # Render the integer as 32 hex digits, then let the socket module
    # produce the canonical compressed textual form.
    packed = binascii.unhexlify("%032x" % v)
    return socket.inet_ntop(socket.AF_INET6, packed)
||||
|
||||
def fmt_item_ipv6(entry):
    """Format an IPv6 range with lo and hi addresses in hex form."""
    # entry is (low_address_int, high_address_int, country_code).
    return "%s,%s,%s\n"%(fmt_ipv6_addr(entry[0]),
                         fmt_ipv6_addr(entry[1]),
                         entry[2])
||||
|
||||
# Binary (bit-string) prefixes of IPv6 ranges that merely embed IPv4
# address space and are therefore excluded from the IPv6 output.
IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16        # ::ffff:0:0/96
IPV6_6TO4_PREFIX = "0010000000000010"            # 2002::/16
TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16 # 2001::/32
||||
|
||||
def dump_item_ipv6(entries, prefix, val):
    """Dump the information for an IPv6 address prefix to entries, where
    'prefix' is a string holding a binary prefix for the address,
    and 'val' is the value to dump.  If the prefix is an IPv4 address
    (starts with 96 bits of 0), is an IPv4-mapped IPv6 address
    (::ffff:0:0/96), is in the 6to4 mapping subnet (2002::/16), or is
    in the Teredo subnet (2001::/32), then print nothing.
    """
    if prefix.startswith(IPV4_PREFIX) or \
       prefix.startswith(IPV4_MAPPED_IPV6_PREFIX) or \
       prefix.startswith(IPV6_6TO4_PREFIX) or \
       prefix.startswith(TEREDO_IPV6_PREFIX):
        return
    v = int(prefix, 2)
    shift = 128 - len(prefix)
    lo = v << shift            # first address covered by the prefix
    hi = ((v+1) << shift) - 1  # last address covered by the prefix
    entries.append((lo, hi, val))
||||
|
||||
def dump_tree(entries, node, dump_item, prefix=""):
    """Walk the tree rooted at 'node', and call dump_item on the
    format_datum output of every leaf of the tree."""

    if isinstance(node, Tree):
        # Internal node: descending left appends a 0 bit to the binary
        # prefix, descending right appends a 1 bit.
        dump_tree(entries, node.left_item, dump_item, prefix+"0")
        dump_tree(entries, node.right_item, dump_item, prefix+"1")
    elif isinstance(node, Datum):
        assert node.kind == TP_MAP
        code = format_datum(node)
        # Leaves without a country code are skipped entirely.
        if code:
            dump_item(entries, prefix, code)
    else:
        # None marks a subtree with no data attached.
        assert node == None
|
||||
def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
    """Write the entries in the_tree to filename.

    dump_item converts a (binary-prefix, value) leaf into range tuples;
    fmt_item renders one (lo, hi, value) tuple as an output line.
    """
    entries = []
    dump_tree(entries, the_tree[0], dump_item)
    fobj = open(filename, 'w')

    # Header comment recording the database build date.
    build_epoch = metadata[0].map['build_epoch'].int_val()
    fobj.write("# Last updated based on %s Maxmind GeoLite2 Country\n"%
               time.strftime('%B %-d %Y', time.gmtime(build_epoch)))

    # Coalesce adjacent ranges that share the same value before writing.
    unwritten = None
    for entry in entries:
        if not unwritten:
            unwritten = entry
        elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
            # entry directly continues the pending range; extend it.
            unwritten = (unwritten[0], entry[1], unwritten[2])
        else:
            fobj.write(fmt_item(unwritten))
            unwritten = entry
    # Flush the last pending range.
    if unwritten:
        fobj.write(fmt_item(unwritten))
    fobj.close()
||||
|
||||
# Script entry point: read the .mmdb file named on the command line and
# write Tor's "geoip" (IPv4) and "geoip6" (IPv6) files to the current
# directory.
content = open(sys.argv[1], 'rb').read()
metadata, the_tree, _ = parse_mm_file(content)

write_geoip_file('geoip', metadata, the_tree, dump_item_ipv4, fmt_item_ipv4)
write_geoip_file('geoip6', metadata, the_tree, dump_item_ipv6, fmt_item_ipv6)
|
Loading…
Reference in New Issue
Block a user