Merge remote-tracking branch 'karsten/mmdb-convert'

Nick Mathewson 2014-02-15 00:08:36 -05:00
commit 9d0af78e3c
4 changed files with 446 additions and 375 deletions

90
src/config/README.geoip Deleted file

@@ -1,90 +0,0 @@
README.geoip -- information on the IP-to-country-code file shipped with tor
===========================================================================
The IP-to-country-code file in src/config/geoip is based on MaxMind's
GeoLite Country database with the following modifications:
- Those "A1" ("Anonymous Proxy") entries lying in between two entries with
  the same country code are automatically changed to that country code
  (see the example below).  These changes can be overridden by specifying
  a different country code in src/config/geoip-manual.
- Other "A1" entries are replaced with country codes specified in
  src/config/geoip-manual, or are left as is if there is no corresponding
  entry in that file.  Even non-"A1" entries can be modified by adding a
  replacement entry to src/config/geoip-manual.  Handle with care.
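For example, with made-up addresses, an "A1" entry that exactly bridges
two entries carrying the same country code gets that code:
  "1.0.0.0","1.0.0.255","16777216","16777471","DE","Germany"
  "1.0.1.0","1.0.1.255","16777472","16777727","A1","Anonymous Proxy"
  "1.0.2.0","1.0.2.255","16777728","16777983","DE","Germany"
Here the middle entry would automatically become
  "1.0.1.0","1.0.1.255","16777472","16777727","DE","Germany"
while the surrounding entries stay unchanged.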
1. Updating the geoip file from a MaxMind database file
-------------------------------------------------------
Download the most recent MaxMind GeoLite Country database:
http://geolite.maxmind.com/download/geoip/database/GeoIPCountryCSV.zip
Run `python deanonymind.py` in the local directory. Review the output to
learn about applied automatic/manual changes and watch out for any
warnings.
Possibly edit geoip-manual to make more/fewer/different manual changes and
re-run `python deanonymind.py`.
When done, prepend the new geoip file with a comment like this:
# Last updated based on $DATE Maxmind GeoLite Country
# See README.geoip for details on the conversion.
2. Verifying automatic and manual changes using diff
----------------------------------------------------
To unzip the original MaxMind file and look at the automatic changes, run:
unzip GeoIPCountryCSV.zip
diff -U1 GeoIPCountryWhois.csv AutomaticGeoIPCountryWhois.csv
To look at subsequent manual changes, run:
diff -U1 AutomaticGeoIPCountryWhois.csv ManualGeoIPCountryWhois.csv
To manually generate the geoip file and compare it to the automatically
created one, run:
cut -d, -f3-5 < ManualGeoIPCountryWhois.csv | sed 's/"//g' > mygeoip
diff -U1 geoip mygeoip
3. Verifying automatic and manual changes using blockfinder
-----------------------------------------------------------
Blockfinder is a powerful tool for handling multiple IP-to-country data
sources. Given a country code, blockfinder can compare conflicting
country code assignments for that code across different data sources.
We can use blockfinder to compare A1 entries in the original MaxMind file
with the same or overlapping blocks in the file generated above and in the
RIR delegation files:
git clone https://github.com/ioerror/blockfinder
cd blockfinder/
python blockfinder -i
python blockfinder -r ../GeoIPCountryWhois.csv
python blockfinder -r ../ManualGeoIPCountryWhois.csv
python blockfinder -p A1 > A1-comparison.txt
The output marks conflicts between assignments using either '*' in case of
two different opinions or '#' for three or more different opinions about
the country code for a given block.
The '*' conflicts are most likely harmless, because there will always be
at least two opinions with the original MaxMind file saying A1 and the
other two sources saying something more meaningful.
However, watch out for '#' conflicts. In these cases, the original
MaxMind file ("A1"), the updated MaxMind file (hopefully the correct
country code), and the RIR delegation files (some other country code) all
disagree.
There are perfectly valid cases where the updated MaxMind file and the RIR
delegation files don't agree. But each of those cases must be verified
manually.

205
src/config/deanonymind.py Deleted file

@@ -1,205 +0,0 @@
#!/usr/bin/env python
import optparse
import os
import sys
import zipfile
"""
Take a MaxMind GeoLite Country database as input and replace A1 entries
with the country code and name of the preceding entry iff the preceding
(subsequent) entry ends (starts) directly before (after) the A1 entry and
both preceding and subsequent entries contain the same country code.
Then apply manual changes, either replacing A1 entries that could not be
replaced automatically or overriding previously made automatic changes.
"""
def main():
options = parse_options()
assignments = read_file(options.in_maxmind)
assignments = apply_automatic_changes(assignments)
write_file(options.out_automatic, assignments)
manual_assignments = read_file(options.in_manual, must_exist=False)
assignments = apply_manual_changes(assignments, manual_assignments)
write_file(options.out_manual, assignments)
write_file(options.out_geoip, assignments, long_format=False)
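# Typical invocation, per README.geoip (run in src/config/ with
# GeoIPCountryCSV.zip downloaded to the same directory):
#
#   python deanonymind.py
#
# All input and output paths can be overridden via command-line options;
# see parse_options() below.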
def parse_options():
parser = optparse.OptionParser()
parser.add_option('-i', action='store', dest='in_maxmind',
default='GeoIPCountryCSV.zip', metavar='FILE',
help='use the specified MaxMind GeoLite Country .zip or .csv '
'file as input [default: %default]')
parser.add_option('-g', action='store', dest='in_manual',
default='geoip-manual', metavar='FILE',
help='use the specified .csv file for manual changes or to '
'override automatic changes [default: %default]')
parser.add_option('-a', action='store', dest='out_automatic',
default="AutomaticGeoIPCountryWhois.csv", metavar='FILE',
help='write full input file plus automatic changes to the '
'specified .csv file [default: %default]')
parser.add_option('-m', action='store', dest='out_manual',
default='ManualGeoIPCountryWhois.csv', metavar='FILE',
help='write full input file plus automatic and manual '
'changes to the specified .csv file [default: %default]')
parser.add_option('-o', action='store', dest='out_geoip',
default='geoip', metavar='FILE',
help='write full input file plus automatic and manual '
'changes to the specified .csv file that can be shipped '
'with tor [default: %default]')
(options, args) = parser.parse_args()
return options
def read_file(path, must_exist=True):
if not os.path.exists(path):
if must_exist:
print 'File %s does not exist. Exiting.' % (path, )
sys.exit(1)
else:
return
if path.endswith('.zip'):
zip_file = zipfile.ZipFile(path)
csv_content = zip_file.read('GeoIPCountryWhois.csv')
zip_file.close()
else:
csv_file = open(path)
csv_content = csv_file.read()
csv_file.close()
assignments = []
for line in csv_content.split('\n'):
stripped_line = line.strip()
if len(stripped_line) > 0 and not stripped_line.startswith('#'):
assignments.append(stripped_line)
return assignments
def apply_automatic_changes(assignments):
print '\nApplying automatic changes...'
result_lines = []
prev_line = None
a1_lines = []
for line in assignments:
if '"A1"' in line:
a1_lines.append(line)
else:
if len(a1_lines) > 0:
new_a1_lines = process_a1_lines(prev_line, a1_lines, line)
for new_a1_line in new_a1_lines:
result_lines.append(new_a1_line)
a1_lines = []
result_lines.append(line)
prev_line = line
if len(a1_lines) > 0:
new_a1_lines = process_a1_lines(prev_line, a1_lines, None)
for new_a1_line in new_a1_lines:
result_lines.append(new_a1_line)
return result_lines
def process_a1_lines(prev_line, a1_lines, next_line):
if not prev_line or not next_line:
return a1_lines # Can't merge first or last line in file.
if len(a1_lines) > 1:
return a1_lines # Can't merge more than 1 line at once.
a1_line = a1_lines[0].strip()
prev_entry = parse_line(prev_line)
a1_entry = parse_line(a1_line)
next_entry = parse_line(next_line)
touches_prev_entry = int(prev_entry['end_num']) + 1 == \
int(a1_entry['start_num'])
touches_next_entry = int(a1_entry['end_num']) + 1 == \
int(next_entry['start_num'])
same_country_code = prev_entry['country_code'] == \
next_entry['country_code']
if touches_prev_entry and touches_next_entry and same_country_code:
new_line = format_line_with_other_country(a1_entry, prev_entry)
print '-%s\n+%s' % (a1_line, new_line, )
return [new_line]
else:
return a1_lines
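# Note that merging is deliberately conservative: an A1 range is only
# rewritten when it exactly bridges, with no gaps, two ranges that agree
# on a country code.  Runs of two or more consecutive A1 lines are never
# merged automatically and must be handled via geoip-manual.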
def parse_line(line):
if not line:
return None
keys = ['start_str', 'end_str', 'start_num', 'end_num',
'country_code', 'country_name']
stripped_line = line.replace('"', '').strip()
parts = stripped_line.split(',')
entry = dict((k, v) for k, v in zip(keys, parts))
return entry
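# For example (made-up input line):
#   parse_line('"1.0.0.0","1.0.0.255","16777216","16777471","DE","Germany"')
# returns the dict (note that all values, including numbers, stay strings):
#   {'start_str': '1.0.0.0', 'end_str': '1.0.0.255', 'start_num': '16777216',
#    'end_num': '16777471', 'country_code': 'DE', 'country_name': 'Germany'}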
def format_line_with_other_country(original_entry, other_entry):
return '"%s","%s","%s","%s","%s","%s"' % (original_entry['start_str'],
original_entry['end_str'], original_entry['start_num'],
original_entry['end_num'], other_entry['country_code'],
other_entry['country_name'], )
def apply_manual_changes(assignments, manual_assignments):
if not manual_assignments:
return assignments
print '\nApplying manual changes...'
manual_dict = {}
for line in manual_assignments:
start_num = parse_line(line)['start_num']
if start_num in manual_dict:
print ('Warning: duplicate start number in manual '
'assignments:\n %s\n %s\nDiscarding first entry.' %
(manual_dict[start_num], line, ))
manual_dict[start_num] = line
result = []
for line in assignments:
entry = parse_line(line)
start_num = entry['start_num']
if start_num in manual_dict:
manual_line = manual_dict[start_num]
manual_entry = parse_line(manual_line)
if entry['start_str'] == manual_entry['start_str'] and \
entry['end_str'] == manual_entry['end_str'] and \
entry['end_num'] == manual_entry['end_num']:
if len(manual_entry['country_code']) != 2:
print '-%s' % (line, ) # only remove, don't replace
del manual_dict[start_num]
elif entry['country_code'] != \
manual_entry['country_code']:
new_line = format_line_with_other_country(entry,
manual_entry)
print '-%s\n+%s' % (line, new_line, )
result.append(new_line)
del manual_dict[start_num]
else:
print ('Warning: not applying ineffective manual '
'change:\n %s\n %s' % (line, manual_line, ))
result.append(line)
else:
print ('Warning: not applying manual change that is only '
'a partial match:\n %s\n %s' %
(line, manual_line, ))
result.append(line)
elif 'country_code' in entry and \
entry['country_code'] == 'A1':
print ('Warning: no manual replacement for A1 entry:\n %s'
% (line, ))
result.append(line)
else:
result.append(line)
if len(manual_dict) > 0:
print 'Warning: could not apply all manual assignments:'
for line in manual_dict.values():
print ' %s' % (line, )
return result
def write_file(path, assignments, long_format=True):
if long_format:
output_lines = assignments
else:
output_lines = []
for long_line in assignments:
entry = parse_line(long_line)
short_line = "%s,%s,%s" % (entry['start_num'],
entry['end_num'], entry['country_code'], )
output_lines.append(short_line)
out_file = open(path, 'w')
out_file.write('\n'.join(output_lines))
out_file.close()
if __name__ == '__main__':
main()

80
src/config/geoip-manual Deleted file

@@ -1,80 +0,0 @@
# This file contains manual overrides of A1 entries (and possibly others)
# in MaxMind's GeoLite Country database. Use deanonymind.py in the same
# directory to process this file when producing a new geoip file. See
# README.geoip in the same directory for details.
# GB, because RIR delegation files say exactly this range
# 46.16.32.0-46.16.39.255 is GB, even though neither previous nor next
# MaxMind range is GB. Both previous and next MaxMind ranges match RIR
# delegation files, too. -KL 2013-03-07
"46.16.32.0","46.16.39.255","772808704","772810751","GB","United Kingdom"
# CH, because previous MaxMind entry 46.19.141.0-46.19.142.255 is CH, and
# RIR delegation files say 46.19.136.0-46.19.143.255 is CH.
# -KL 2012-11-27
"46.19.143.0","46.19.143.255","773033728","773033983","CH","Switzerland"
# GB, because next MaxMind entry 46.166.129.0-46.166.134.255 is GB, and
# RIR delegation files say 46.166.128.0-46.166.191.255 is GB.
# -KL 2012-11-27
"46.166.128.0","46.166.128.255","782663680","782663935","GB","United Kingdom"
# US, because previous MaxMind entry 70.159.21.51-70.232.244.255 is US,
# because next MaxMind entry 70.232.245.58-70.232.245.59 is A2 ("Satellite
# Provider") which is country information about as useless as A1, and
# because RIR delegation files say 70.224.0.0-70.239.255.255 is US.
# -KL 2012-11-27
"70.232.245.0","70.232.245.57","1189672192","1189672249","US","United States"
# US, because next MaxMind entry 70.232.246.0-70.240.141.255 is US,
# because previous MaxMind entry 70.232.245.58-70.232.245.59 is A2
# ("Satellite Provider") which is country information about as useless
# as A1, and because RIR delegation files say 70.224.0.0-70.239.255.255 is
# US. -KL 2012-11-27
"70.232.245.60","70.232.245.255","1189672252","1189672447","US","United States"
# GB, despite neither previous (GE) nor next (LV) MaxMind entry being GB,
# but because RIR delegation files agree with both previous and next
# MaxMind entry and say GB for 91.228.0.0-91.228.3.255. -KL 2012-11-27
"91.228.0.0","91.228.3.255","1541668864","1541669887","GB","United Kingdom"
# NL, because next MaxMind entry 176.56.173.0-176.56.173.63 is NL, and RIR
# delegation files say 176.56.160.0-176.56.191.255 is NL. -KL 2013-05-13
"176.56.172.0","176.56.172.255","2956504064","2956504319","NL","Netherlands"
# NL, despite neither previous (RU) nor next (GB) MaxMind entry being NL,
# but because RIR delegation files say entire range
# 176.56.160.0-176.56.191.255 is NL. -KL 2013-05-13
"176.56.174.0","176.56.174.255","2956504576","2956504831","NL","Netherlands"
# GB, because RIR delegation files say exactly this range
# 185.25.84.0-185.25.87.255 is GB, even though neither previous nor next
# MaxMind range is GB. Both previous and next MaxMind ranges match RIR
# delegation files, too. -KL 2013-05-13
"185.25.84.0","185.25.87.255","3105444864","3105445887","GB","United Kingdom"
# US, because next MaxMind entry 199.101.193.0-199.101.195.255 is US, and,
# together with next entries, matches RIR delegation file entry
# 199.101.192.0-199.101.199.255 which is US. -KL 2013-05-13
"199.101.192.0","199.101.192.255","3345334272","3345334527","US","United States"
# US, because ARIN says 199.255.208.0-199.255.215.255 is US.
# Changed entry start from 199.255.213.0 to 199.255.208.0 on 2013-08-12.
# Split up into 199.255.208.0-199.255.209.127 and
# 199.255.210.0-199.255.215.255 on 2013-10-11. -KL 2013-10-11
"199.255.208.0","199.255.209.127","3355430912","3355431295","US","United States"
"199.255.210.0","199.255.215.255","3355431424","3355432959","US","United States"
# EU, despite neither previous (RU) nor next (SE) MaxMind entry being EU,
# but because RIR delegation files agree with previous MaxMind entry and
# say EU for 217.15.160.0-217.15.175.255. -KL 2013-05-13
"217.15.160.0","217.15.164.255","3641679872","3641681151","EU","Europe"
# FR, because previous MaxMind entry 217.15.166.0-217.15.166.255 is FR,
# and RIR delegation files contain a block 217.15.160.0-217.15.175.255
# which, however, is EU, not FR. But merging with next MaxMind entry
# 217.15.176.0-217.15.191.255 which is KZ and which fully matches what
# the RIR delegation files say seems unlikely to be correct.
# -KL 2012-11-27
"217.15.167.0","217.15.175.255","3641681664","3641683967","FR","France"

446
src/config/mmdb-convert.py Normal file

@@ -0,0 +1,446 @@
#!/usr/bin/python3
# This software has been dedicated to the public domain under the CC0
# public domain dedication.
#
# To the extent possible under law, the person who associated CC0
# with mmdb-convert.py has waived all copyright and related or
# neighboring rights to mmdb-convert.py.
#
# You should have received a copy of the CC0 legalcode along with this
# work in doc/cc0.txt. If not, see
# <http://creativecommons.org/publicdomain/zero/1.0/>.
# Nick Mathewson is responsible for this kludge, but takes no
# responsibility for it.
"""This kludge is meant to
parse mmdb files in sufficient detail to dump out the old format
that Tor expects. It's also meant to be pure-python.
When given a simplicity/speed tradeoff, it opts for simplicity.
You will not understand the code without understanding the MaxMind-DB
file format. It is specified at:
https://github.com/maxmind/MaxMind-DB/blob/master/MaxMind-DB-spec.md.
This isn't so much tested. When it breaks, you get to keep both
pieces.
"""
import struct
import bisect
import socket
import binascii
import sys
import time
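# Expected invocation (the .mmdb filename here is only an example):
#
#   python3 mmdb-convert.py GeoLite2-Country.mmdb
#
# The script reads the MaxMind-DB file named in sys.argv[1] and writes
# 'geoip' and 'geoip6' in the current directory.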
METADATA_MARKER = b'\xab\xcd\xefMaxMind.com'
# Here's some python2/python3 junk. Better solutions wanted.
try:
ord(b"1"[0])
except TypeError:
def byte_to_int(b):
"convert a single element of a bytestring to an integer."
return b
else:
byte_to_int = ord
# Here's some more python2/python3 junk. Better solutions wanted.
try:
str(b"a", "utf8")
except TypeError:
bytesToStr = str
else:
def bytesToStr(b):
"convert a bytestring in utf8 to a string."
return str(b, 'utf8')
def to_int(s):
"Parse a big-endian integer from bytestring s."
result = 0
for c in s:
result *= 256
result += byte_to_int(c)
return result
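# For example, to_int(b'\x01\x02') == 0x0102 == 258.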
def to_int24(s):
"Parse a pair of big-endian 24-bit integers from bytestring s."
a, b, c = struct.unpack("!HHH", s)
    return ((a << 8) + (b >> 8)), (((b & 0xff) << 16) + c)
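# With 24-bit records a node occupies 6 bytes: the left record is bytes
# 0-2 and the right record is bytes 3-5, both big-endian.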
def to_int32(s):
"Parse a pair of big-endian 32-bit integers from bytestring s."
a, b = struct.unpack("!LL", s)
return a, b
def to_int28(s):
"Parse a pair of big-endian 28-bit integers from bytestring s."
    a, b = struct.unpack("!LL", s + b'\x00')
return (((a & 0xf0) << 20) + (a >> 8)), ((a & 0x0f) << 24) + (b >> 8)
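# With 28-bit records a node occupies 7 bytes; per the MaxMind-DB spec the
# middle byte's high nibble supplies the top 4 bits of the left record and
# its low nibble the top 4 bits of the right record.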
class Tree(object):
"Holds a node in the tree"
def __init__(self, left, right):
self.left = left
self.right = right
def resolve_tree(tree, data):
"""Fill in the left_item and right_item fields for all values in the tree
so that they point to another Tree, or to a Datum, or to None."""
d = Datum(None, None, None, None)
def resolve_item(item):
"Helper: resolve a single index."
if item < len(tree):
return tree[item]
elif item == len(tree):
return None
else:
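            # Indices beyond len(tree) point into the data section, which
            # starts after the 16-byte all-zero separator that follows the
            # tree (see parse_mm_file).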
d.pos = (item - len(tree) - 16)
p = bisect.bisect_left(data, d)
assert data[p].pos == d.pos
return data[p]
for t in tree:
t.left_item = resolve_item(t.left)
t.right_item = resolve_item(t.right)
def parse_search_tree(s, record_size):
"""Given a bytestring and a record size in bits, parse the tree.
Return a list of nodes."""
record_bytes = (record_size*2) // 8
nodes = []
p = 0
try:
to_leftright = { 24: to_int24,
28: to_int28,
32: to_int32 }[ record_size ]
except KeyError:
raise NotImplementedError("Unsupported record size in bits: %d" %
record_size)
while p < len(s):
left, right = to_leftright(s[p:p+record_bytes])
p += record_bytes
nodes.append( Tree(left, right ) )
return nodes
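# For example, with record_size == 24, record_bytes == 6, so node i starts
# at byte offset 6*i within the tree section.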
class Datum(object):
"""Holds a single entry from the Data section"""
def __init__(self, pos, kind, ln, data):
self.pos = pos # Position of this record within data section
self.kind = kind # Type of this record. one of TP_*
self.ln = ln # Length field, which might be overloaded.
self.data = data # Raw bytes data.
self.children = None # Used for arrays and maps.
def __repr__(self):
return "Datum(%r,%r,%r,%r)" % (self.pos, self.kind, self.ln, self.data)
# Comparison functions used for bsearch
def __lt__(self, other):
return self.pos < other.pos
def __gt__(self, other):
return self.pos > other.pos
def __eq__(self, other):
return self.pos == other.pos
def build_maps(self):
"""If this is a map or array, fill in its 'map' field if it's a map,
and the 'map' field of all its children."""
if not hasattr(self, 'nChildren'):
return
if self.kind == TP_ARRAY:
del self.nChildren
for c in self.children:
c.build_maps()
elif self.kind == TP_MAP:
del self.nChildren
self.map = {}
for i in range(0, len(self.children), 2):
k = self.children[i].deref()
v = self.children[i+1].deref()
v.build_maps()
if k.kind != TP_UTF8:
raise ValueError("Bad dictionary key type %d"% k.kind)
self.map[bytesToStr(k.data)] = v
def int_val(self):
"""If this is an integer type, return its value"""
assert self.kind in (TP_UINT16, TP_UINT32, TP_UINT64,
TP_UINT128, TP_SINT32)
i = to_int(self.data)
if self.kind == TP_SINT32:
if i & 0x80000000:
i = i - 0x100000000
return i
def deref(self):
"""If this value is a pointer, return its pointed-to-value. Chase
through multiple layers of pointers if need be. If this isn't
a pointer, return it."""
n = 0
s = self
while s.kind == TP_PTR:
s = s.ptr
n += 1
assert n < 100
return s
def resolve_pointers(data):
"""Fill in the ptr field of every pointer in data."""
search = Datum(None, None, None, None)
for d in data:
if d.kind == TP_PTR:
search.pos = d.ln
p = bisect.bisect_left(data, search)
assert data[p].pos == d.ln
d.ptr = data[p]
TP_PTR = 1
TP_UTF8 = 2
TP_DBL = 3
TP_BYTES = 4
TP_UINT16 = 5
TP_UINT32 = 6
TP_MAP = 7
TP_SINT32 = 8
TP_UINT64 = 9
TP_UINT128 = 10
TP_ARRAY = 11
TP_DCACHE = 12
TP_END = 13
TP_BOOL = 14
TP_FLOAT = 15
def get_type_and_len(s):
"""Data parsing helper: decode the type value and much-overloaded 'length'
field for the value starting at s. Return a 3-tuple of type, length,
and number of bytes used to encode type-plus-length."""
c = byte_to_int(s[0])
tp = c >> 5
skip = 1
if tp == 0:
tp = byte_to_int(s[1])+7
skip = 2
ln = c & 31
# I'm sure I don't know what they were thinking here...
if tp == TP_PTR:
len_len = (ln >> 3) + 1
if len_len < 4:
ln &= 7
ln <<= len_len * 8
else:
ln = 0
ln += to_int(s[skip:skip+len_len])
ln += (0, 0, 2048, 526336, 0)[len_len]
skip += len_len
elif ln >= 29:
len_len = ln - 28
ln = to_int(s[skip:skip+len_len])
ln += (0, 29, 285, 65821)[len_len]
skip += len_len
return tp, ln, skip
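# For example, the control byte 0x43 == 0b010_00011 decodes to tp == 2
# (TP_UTF8) and ln == 3, i.e. a three-byte UTF-8 string follows.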
# Set of types for which 'length' doesn't mean length.
IGNORE_LEN_TYPES = set([
TP_MAP, # Length is number of key-value pairs that follow.
TP_ARRAY, # Length is number of members that follow.
TP_PTR, # Length is index to pointed-to data element.
TP_BOOL, # Length is 0 or 1.
    TP_DCACHE, # Length is number of members that follow.
])
def parse_data_section(s):
"""Given a data section encoded in a bytestring, return a list of
Datum items."""
    # Stack of possibly nested containers. We use the 'nChildren' member of
    # the last one to tell how many more items nest directly inside.
stack = []
# List of all items, including nested ones.
data = []
# Byte index within the data section.
pos = 0
while s:
tp, ln, skip = get_type_and_len(s)
if tp in IGNORE_LEN_TYPES:
real_len = 0
else:
real_len = ln
d = Datum(pos, tp, ln, s[skip:skip+real_len])
data.append(d)
pos += skip+real_len
s = s[skip+real_len:]
if stack:
stack[-1].children.append(d)
stack[-1].nChildren -= 1
if stack[-1].nChildren == 0:
del stack[-1]
if d.kind == TP_ARRAY:
d.nChildren = d.ln
d.children = []
stack.append(d)
elif d.kind == TP_MAP:
d.nChildren = d.ln * 2
d.children = []
stack.append(d)
return data
def parse_mm_file(s):
"""Parse a MaxMind-DB file."""
try:
metadata_ptr = s.rindex(METADATA_MARKER)
except ValueError:
raise ValueError("No metadata!")
metadata = parse_data_section(s[metadata_ptr+len(METADATA_MARKER):])
if metadata[0].kind != TP_MAP:
raise ValueError("Bad map")
metadata[0].build_maps()
mm = metadata[0].map
tree_size = (((mm['record_size'].int_val() * 2) // 8 ) *
mm['node_count'].int_val())
if s[tree_size:tree_size+16] != b'\x00'*16:
raise ValueError("Missing section separator!")
tree = parse_search_tree(s[:tree_size], mm['record_size'].int_val())
data = parse_data_section(s[tree_size+16:metadata_ptr])
resolve_pointers(data)
resolve_tree(tree, data)
for d in data:
d.build_maps()
return metadata, tree, data
def format_datum(datum):
"""Given a Datum at a leaf of the tree, return the string that we should
write as its value.
"""
try:
return bytesToStr(datum.map['country'].map['iso_code'].data)
except KeyError:
pass
return None
IPV4_PREFIX = "0"*96
def dump_item_ipv4(entries, prefix, val):
"""Dump the information for an IPv4 address to entries, where 'prefix'
is a string holding a binary prefix for the address, and 'val' is the
value to dump. If the prefix is not an IPv4 address (it does not start
with 96 bits of 0), then print nothing.
"""
if not prefix.startswith(IPV4_PREFIX):
return
prefix = prefix[96:]
v = int(prefix, 2)
shift = 32 - len(prefix)
lo = v << shift
hi = ((v+1) << shift) - 1
entries.append((lo, hi, val))
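# For example, a 104-bit prefix of 96 zero bits followed by "00000001"
# yields lo == 1 << 24 == 16777216 (1.0.0.0) and
# hi == (2 << 24) - 1 == 33554431 (1.255.255.255).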
def fmt_item_ipv4(entry):
"""Format an IPv4 range with lo and hi addresses in decimal form."""
return "%d,%d,%s\n"%(entry[0], entry[1], entry[2])
def fmt_ipv6_addr(v):
"""Given a 128-bit integer representing an ipv6 address, return a
string for that ipv6 address."""
return socket.inet_ntop(socket.AF_INET6, binascii.unhexlify("%032x"%v))
def fmt_item_ipv6(entry):
"""Format an IPv6 range with lo and hi addresses in hex form."""
return "%s,%s,%s\n"%(fmt_ipv6_addr(entry[0]),
fmt_ipv6_addr(entry[1]),
entry[2])
IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16
IPV6_6TO4_PREFIX = "0010000000000010"
TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16
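# In CIDR notation these binary prefixes are ::ffff:0:0/96 (IPv4-mapped),
# 2002::/16 (6to4), and 2001::/32 (Teredo), respectively.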
def dump_item_ipv6(entries, prefix, val):
    """Dump the information for an IPv6 address prefix to entries, where
    'prefix' is a string holding a binary prefix for the address,
    and 'val' is the value to dump.  If the prefix is an IPv4 address
    (starts with 96 bits of 0), is an IPv4-mapped IPv6 address
    (::ffff:0:0/96), is in the 6to4 mapping subnet (2002::/16), or is in
    the Teredo subnet (2001::/32), then print nothing.
    """
if prefix.startswith(IPV4_PREFIX) or \
prefix.startswith(IPV4_MAPPED_IPV6_PREFIX) or \
prefix.startswith(IPV6_6TO4_PREFIX) or \
prefix.startswith(TEREDO_IPV6_PREFIX):
return
v = int(prefix, 2)
shift = 128 - len(prefix)
lo = v << shift
hi = ((v+1) << shift) - 1
entries.append((lo, hi, val))
def dump_tree(entries, node, dump_item, prefix=""):
"""Walk the tree rooted at 'node', and call dump_item on the
format_datum output of every leaf of the tree."""
if isinstance(node, Tree):
dump_tree(entries, node.left_item, dump_item, prefix+"0")
dump_tree(entries, node.right_item, dump_item, prefix+"1")
elif isinstance(node, Datum):
assert node.kind == TP_MAP
code = format_datum(node)
if code:
dump_item(entries, prefix, code)
else:
        assert node is None
def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
"""Write the entries in the_tree to filename."""
entries = []
dump_tree(entries, the_tree[0], dump_item)
fobj = open(filename, 'w')
build_epoch = metadata[0].map['build_epoch'].int_val()
fobj.write("# Last updated based on %s Maxmind GeoLite2 Country\n"%
time.strftime('%B %-d %Y', time.gmtime(build_epoch)))
unwritten = None
for entry in entries:
if not unwritten:
unwritten = entry
elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
unwritten = (unwritten[0], entry[1], unwritten[2])
else:
fobj.write(fmt_item(unwritten))
unwritten = entry
if unwritten:
fobj.write(fmt_item(unwritten))
fobj.close()
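# Entry point: read the .mmdb file named on the command line and emit
# Tor's legacy 'geoip' (IPv4) and 'geoip6' (IPv6) files.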
content = open(sys.argv[1], 'rb').read()
metadata, the_tree, _ = parse_mm_file(content)
write_geoip_file('geoip', metadata, the_tree, dump_item_ipv4, fmt_item_ipv4)
write_geoip_file('geoip6', metadata, the_tree, dump_item_ipv6, fmt_item_ipv6)