Rust tool to convert IPFire Location dump into CSV format.

The IPFire people provide a tool that collects data from several
top-level sources, combines it into a single database, and annotates
it with optional overrides.  This tool transforms the "dump" format
of their database into the form Tor expects.
This commit is contained in:
Nick Mathewson 2021-02-22 08:30:11 -05:00
parent 8ccfd4a51a
commit 0d4237839b
6 changed files with 349 additions and 0 deletions

View File

@ -0,0 +1,25 @@
To generate new geoip files, you'll need to install the
libloc/"location" tool provided by https://location.ipfire.org/.
I personally build it with:
./configure CFLAGS='-g -O2' --disable-perl --without-systemd --prefix=/opt/libloc
make
make install
Then (after adjusting PATH and PYTHONPATH) you can get the latest
dump with:
location update
location dump geoip-dump.txt
And transform it into geoip files with:
cargo run --release -- -i geoip-dump.txt
==============================
Note that the current version "0.1.9" of rangemap has a performance
bug, making this tool quite slow. Previous versions had a
correctness bug that made the output needlessly long. With luck,
there will soon be a fast correct rangemap version.

View File

@ -0,0 +1 @@
target

View File

@ -0,0 +1,16 @@
[package]
name = "geoip-db-tool"
version = "0.1.0"
authors = ["Nick Mathewson <nickm@torproject.org>"]
edition = "2018"
license = "MIT OR Apache-2.0"
publish = false
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
ipnetwork= "0.17.0"
rangemap= "0.1.9"
# I use this for now to avoid a performance hit due to a bug in 0.1.9
# rangemap = {version = "*", path = "/home/nickm/src/rangemap/" }
argh = "0.1.4"

View File

@ -0,0 +1,126 @@
//! Code to parse a dump file.
use std::collections::HashMap;
use std::convert::TryInto;
use std::iter::Peekable;
use super::NetBlock;
/// Reads the blocks of a dump file from an iterator over its lines.
///
/// The line iterator is wrapped in a `Peekable` so that the header can be
/// read by looking at each line before deciding whether to consume it.
pub struct BlockReader<I>
where
I: Iterator<Item = std::io::Result<String>>,
{
/// The lines of the input that have not been consumed yet.
iter: Peekable<I>,
}
/// The outcome of parsing one empty-line-delimited block of the dump.
enum AnyBlock {
/// A block that lacks a `net` key or a `country` key; the `Iterator`
/// implementation for `BlockReader` skips these.
NotNet,
/// A block that describes a network and its country.
NetBlock(NetBlock),
}
impl<I> BlockReader<I>
where
    I: Iterator<Item = std::io::Result<String>>,
{
    /// Create a reader over `iter`, which yields the lines of a dump file.
    pub fn new(iter: I) -> Self {
        BlockReader {
            iter: iter.peekable(),
        }
    }

    /// Extract the initial header from the file.
    ///
    /// Consumes leading lines for as long as they start with `#`, and returns
    /// them concatenated, each followed by a newline.  The first line that
    /// does not start with `#` (or that could not be read) is left unconsumed.
    pub fn extract_header(&mut self) -> String {
        let mut res = String::new();
        while let Some(Ok(line)) = self.iter.peek() {
            if !line.starts_with('#') {
                break;
            }
            res.push_str(line);
            res.push('\n');
            // We only peeked above; now actually consume the header line.
            let _ = self.iter.next();
        }
        res
    }

    /// Extract the next empty-line-delimited block from the file.
    ///
    /// Returns `None` at the end of the input, `Some(Err(_))` if a line could
    /// not be read or is not of the form `key: value`, and otherwise the
    /// parsed block.  Blocks without both a `net` and a `country` key are
    /// reported as `AnyBlock::NotNet`.
    ///
    /// Anything after a `#` on a line is treated as a comment and dropped
    /// before the line is examined, so a comment-only line counts as empty.
    ///
    /// This isn't terribly efficient, but it's "fast enough".
    ///
    /// # Panics
    ///
    /// Panics if the value of `net` cannot be parsed as a network, or if the
    /// value of `country` is not exactly two bytes long.  These stay loud on
    /// purpose so that bad data is never silently dropped from the output.
    fn get_block(&mut self) -> Option<std::io::Result<AnyBlock>> {
        let mut kv = HashMap::new();
        for line in self.iter.by_ref() {
            let line_orig = match line {
                Ok(text) => text,
                Err(e) => return Some(Err(e)),
            };
            // Strip any trailing comment, then the surrounding whitespace.
            let line = line_orig.split('#').next().unwrap_or("").trim();
            if line.is_empty() {
                if kv.is_empty() {
                    // Blank lines before the block starts are just padding.
                    continue;
                }
                // A blank line after some content ends the block.
                break;
            }
            let mut parts = line.splitn(2, ':');
            let (key, value) = match (parts.next(), parts.next()) {
                (Some(key), Some(value)) => (key.trim(), value.trim()),
                _ => {
                    // Report the problem instead of pretending we hit the end
                    // of the input.
                    return Some(Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        format!(
                            "malformed line in dump (expected `key: value`): {:?}",
                            line_orig
                        ),
                    )));
                }
            };
            kv.insert(key.to_string(), value.to_string());
        }
        if kv.is_empty() {
            return None;
        }

        let net = match kv.get("net") {
            Some(text) => match text.parse() {
                Ok(parsed) => parsed,
                Err(_) => panic!("invalid network {:?} in dump", text),
            },
            None => return Some(Ok(AnyBlock::NotNet)),
        };
        let cc: [u8; 2] = match kv.get("country") {
            Some(country) => country.as_bytes().try_into().unwrap_or_else(|_| {
                panic!("country code {:?} is not exactly two bytes long", country)
            }),
            None => return Some(Ok(AnyBlock::NotNet)),
        };

        /// A flag is set only when its key is present with the value `true`.
        fn is_true(v: Option<&String>) -> bool {
            v.map_or(false, |s| s == "true")
        }

        let is_anon_proxy = is_true(kv.get("is-anonymous-proxy"));
        // NOTE(review): the sibling keys have no `-proxy` suffix pattern in
        // common; confirm that the dump really spells this key
        // `is-anycast-proxy` (rather than `is-anycast`) and that flags use the
        // value `true`.
        let is_anycast = is_true(kv.get("is-anycast-proxy"));
        let is_satellite = is_true(kv.get("is-satellite-provider"));

        Some(Ok(AnyBlock::NetBlock(NetBlock {
            net,
            cc,
            is_anon_proxy,
            is_anycast,
            is_satellite,
        })))
    }
}
/// Iterates over the network blocks of the dump, skipping every other kind of
/// block.
///
/// Iteration ends at the end of the input.  It also ends at the first error,
/// because `Item` cannot carry one; the error is printed to stderr so that a
/// truncated read does not pass unnoticed.
impl<I> Iterator for BlockReader<I>
where
    I: Iterator<Item = std::io::Result<String>>,
{
    type Item = NetBlock;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // `?` ends the iteration when there are no more blocks.
            match self.get_block()? {
                Ok(AnyBlock::NetBlock(n)) => return Some(n),
                Ok(AnyBlock::NotNet) => continue,
                Err(e) => {
                    eprintln!("error while reading dump; stopping early: {}", e);
                    return None;
                }
            }
        }
    }
}

View File

@ -0,0 +1,165 @@
//! A basic tool to convert IPFire Location dumps into the CSV formats that Tor
//! expects.
mod db;
use argh::FromArgs;
use ipnetwork::IpNetwork;
use rangemap::RangeInclusiveMap;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::net::{IpAddr, Ipv6Addr};
use std::path::{Path, PathBuf};
/// Default destination of the IPv4 output: `geoip` in the current directory.
fn default_ipv4_path() -> PathBuf {
    PathBuf::from("./geoip")
}
/// Default destination of the IPv6 output: `geoip6` in the current directory.
fn default_ipv6_path() -> PathBuf {
    PathBuf::from("./geoip6")
}
// Command-line arguments, parsed by `argh`.
//
// NOTE: argh turns the `///` doc comments below into the `--help` text, so
// they are user-facing strings; edit them with that in mind.
#[derive(FromArgs)]
/// Convert an IPFire Location dump into CSV geoip files.
struct Args {
/// where to store the IPv4 geoip output
// Optional; falls back to `default_ipv4_path()`.
#[argh(option, default = "default_ipv4_path()", short = '4')]
output_ipv4: PathBuf,
/// where to store the IPv6 geoip6 output
// Optional; falls back to `default_ipv6_path()`.
#[argh(option, default = "default_ipv6_path()", short = '6')]
output_ipv6: PathBuf,
/// where to find the dump file
// No default is given for this option.
#[argh(option, short = 'i')]
input: PathBuf,
}
/// Represents a network block from running `location dump`.
///
/// Comparisons between blocks (`PartialEq`, `Ord`) look only at `net`; the
/// country code and the flags take no part in them.
#[derive(Debug, Clone)]
pub struct NetBlock {
/// The network that this block describes.
pub net: IpNetwork,
/// The two bytes of the block's `country` value.
pub cc: [u8; 2],
/// Set when the block has `is-anonymous-proxy: true`.
pub is_anon_proxy: bool,
/// Set when the block has `is-anycast-proxy: true`.
pub is_anycast: bool,
/// Set when the block has `is-satellite-provider: true`.
pub is_satellite: bool,
}
/// Two blocks are equal when they describe the same network: the same prefix
/// length and the same network address.
///
/// This uses exactly the key that the `Ord` implementation compares, so that
/// `a == b` holds precisely when `a.cmp(&b)` is `Ordering::Equal`, as the
/// `Ord` contract requires.  Comparing `net` directly is stricter: an
/// `IpNetwork` keeps the address as it was written, so `10.0.0.1/8` and
/// `10.0.0.0/8` would compare unequal even though they are the same network.
impl PartialEq for NetBlock {
    fn eq(&self, other: &Self) -> bool {
        self.net.prefix() == other.net.prefix() && self.net.network() == other.net.network()
    }
}
/// Network blocks sort by ascending prefix length first, so the largest
/// networks come first, and by network address among blocks of the same
/// prefix length.
impl Ord for NetBlock {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Tuples compare lexicographically: prefix first, then address.
        let lhs = (self.net.prefix(), self.net.network());
        let rhs = (other.net.prefix(), other.net.network());
        lhs.cmp(&rhs)
    }
}
/// The partial order is the total order from the `Ord` implementation, so
/// `partial_cmp` never returns `None`.
impl PartialOrd for NetBlock {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
/// Marker impl: `Ord` requires `Eq`, and it adds no methods of its own.
impl Eq for NetBlock {}
/// Comment block written at the top of both output files, immediately before
/// the header copied from the input dump.
const PROLOGUE: &str = "\
# This file has been converted from the IPFire Location database
# using Tor's geoip-db-tool. For more information on the data, see
# https://location.ipfire.org/.
#
# Below is the header from the original export:
#
";
/// Read an input file in the `location dump` format, and write CSV ipv4 and ipv6 files.
///
/// This code tries to be "efficient enough"; most of the logic is handled by
/// using the rangemap crate.
fn convert(input: &Path, output_v4: &Path, output_v6: &Path) -> std::io::Result<()> {
let f = File::open(input)?;
let f = BufReader::new(f);
let mut blocks = Vec::new();
let mut reader = db::BlockReader::new(f.lines());
let hdr = reader.extract_header();
// Read blocks, and then sort them by specificity and address.
for nb in reader {
blocks.push(nb);
}
blocks.sort();
// Convert the sorted blocks into a map from address ranges into
// country codes.
//
// Note that since we have sorted the blocks from least to most specific,
// we will be puttting them into the maps in the right order, so that the
// most specific rule "wins".
//
// We use u32 and u128 as the index types for these RangeInclusiveMaps,
// so that we don't need to implement a step function for IpAddr.
let mut v4map: RangeInclusiveMap<u32, [u8; 2], _> = RangeInclusiveMap::new();
let mut v6map: RangeInclusiveMap<u128, [u8; 2], _> = RangeInclusiveMap::new();
let mut n = 0usize;
let num_blocks = blocks.len();
for nb in blocks {
n += 1;
if n % 100000 == 0 {
println!("{}/{}", n, num_blocks);
}
let start = nb.net.network();
let end = nb.net.broadcast();
match (start, end) {
(IpAddr::V4(a), IpAddr::V4(b)) => {
v4map.insert(a.into()..=b.into(), nb.cc);
}
(IpAddr::V6(a), IpAddr::V6(b)) => {
v6map.insert(a.into()..=b.into(), nb.cc);
}
(_, _) => panic!("network started and ended in different families!?"),
}
}
// Write the ranges out to the appropriate files, in order.
let mut v4 = BufWriter::new(File::create(output_v4)?);
let mut v6 = BufWriter::new(File::create(output_v6)?);
v4.write_all(PROLOGUE.as_bytes())?;
v4.write_all(hdr.as_bytes())?;
for (r, cc) in v4map.iter() {
let a: u32 = *r.start();
let b: u32 = *r.end();
writeln!(&mut v4, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
}
v6.write_all(PROLOGUE.as_bytes())?;
v6.write_all(hdr.as_bytes())?;
for (r, cc) in v6map.iter() {
let a: Ipv6Addr = (*r.start()).into();
let b: Ipv6Addr = (*r.end()).into();
writeln!(&mut v6, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
}
// The documentation says you should always flush a BufWriter.
v4.flush()?;
v6.flush()?;
Ok(())
}
fn main() -> std::io::Result<()> {
let args: Args = argh::from_env();
convert(
args.input.as_path(),
args.output_ipv4.as_path(),
args.output_ipv6.as_path(),
)
}

View File

@ -0,0 +1,16 @@
#!/bin/sh
# Fetch the latest IPFire Location database, dump it, and convert the dump
# into geoip files with geoip-db-tool.  The tool is given no output options,
# so it writes to its default paths, relative to the directory this script is
# run from.

set -e

DIR=$(cd "$(dirname "$0")" && pwd)
TMP=$(mktemp -d)
# Remove the temporary directory, and the dump inside it, when the script
# exits, whether it succeeded or failed.
trap 'rm -rf "$TMP"' EXIT

location update
location dump "$TMP/geoip-dump.txt"

# Build in a subshell so that this script's working directory is unchanged.
(cd "$DIR/geoip-db-tool/" && cargo build --release)

"$DIR/geoip-db-tool/target/release/geoip-db-tool" -i "$TMP/geoip-dump.txt"