mirror of
https://gitlab.torproject.org/tpo/core/tor.git
synced 2024-11-23 20:03:31 +01:00
Rust tool to convert IPFire Location dump into CSV format.
The IPFire people provide a tool that collects data from several top-level sources, combines it into a single database, and annotates it with optional overrides. This tool transforms the "dump" format of their database into the form Tor expects.
This commit is contained in:
parent
8ccfd4a51a
commit
0d4237839b
25
scripts/maint/geoip/README.geoip
Normal file
25
scripts/maint/geoip/README.geoip
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
To generate new geoip files, you'll need to install the
|
||||||
|
libloc/"location" tool provided by https://location.ipfire.org/.
|
||||||
|
I personally build it with:
|
||||||
|
|
||||||
|
./configure CFLAGS='-g -O2' --disable-perl --without-systemd --prefix=/opt/libloc
|
||||||
|
make
|
||||||
|
make install
|
||||||
|
|
||||||
|
Then (after adjusting PATH and PYTHONPATH) you can get the latest
|
||||||
|
dump with:
|
||||||
|
|
||||||
|
location update
|
||||||
|
location dump geoip-dump.txt
|
||||||
|
|
||||||
|
And transform it into geoip files with (run from the geoip-db-tool directory):
|
||||||
|
|
||||||
|
cargo run --release -- -i geoip-dump.txt
|
||||||
|
|
||||||
|
|
||||||
|
==============================
|
||||||
|
|
||||||
|
Note that the current version "0.1.9" of rangemap has a performance
|
||||||
|
bug, making this tool quite slow. Previous versions had a
|
||||||
|
correctness bug that made the output needlessly long. With luck,
|
||||||
|
there will soon be a fast correct rangemap version.
|
1
scripts/maint/geoip/geoip-db-tool/.gitignore
vendored
Normal file
1
scripts/maint/geoip/geoip-db-tool/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
target
|
16
scripts/maint/geoip/geoip-db-tool/Cargo.toml
Normal file
16
scripts/maint/geoip/geoip-db-tool/Cargo.toml
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
[package]
|
||||||
|
name = "geoip-db-tool"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Nick Mathewson <nickm@torproject.org>"]
|
||||||
|
edition = "2018"
|
||||||
|
license = "MIT OR Apache-2.0"
|
||||||
|
publish = false
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
ipnetwork= "0.17.0"
|
||||||
|
rangemap= "0.1.9"
|
||||||
|
# I use this for now to avoid a performance hit due to a bug on 0.1.9
|
||||||
|
# rangemap = {version = "*", path = "/home/nickm/src/rangemap/" }
|
||||||
|
argh = "0.1.4"
|
126
scripts/maint/geoip/geoip-db-tool/src/db.rs
Normal file
126
scripts/maint/geoip/geoip-db-tool/src/db.rs
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
//! Code to parse a dump file.
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
use std::iter::Peekable;
|
||||||
|
|
||||||
|
use super::NetBlock;
|
||||||
|
|
||||||
|
/// Reads network blocks out of a line-oriented dump.
///
/// The wrapped line iterator is kept peekable so that the header can be
/// scanned without consuming the first line that follows it.
pub struct BlockReader<I: Iterator<Item = std::io::Result<String>>> {
    /// Source of input lines, with one line of lookahead.
    iter: Peekable<I>,
}
|
||||||
|
|
||||||
|
enum AnyBlock {
|
||||||
|
NotNet,
|
||||||
|
NetBlock(NetBlock),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<I> BlockReader<I>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = std::io::Result<String>>,
|
||||||
|
{
|
||||||
|
pub fn new(iter: I) -> Self {
|
||||||
|
BlockReader {
|
||||||
|
iter: iter.peekable(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the initial header from the file.
|
||||||
|
pub fn extract_header(&mut self) -> String {
|
||||||
|
let mut res: String = "".to_string();
|
||||||
|
|
||||||
|
while let Some(Ok(line)) = self.iter.peek() {
|
||||||
|
if !line.starts_with('#') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
res.push_str(line.as_str());
|
||||||
|
res.push('\n');
|
||||||
|
let _ = self.iter.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
res
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the next empty-line-delimited block from the file.
|
||||||
|
///
|
||||||
|
/// This isn't terribly efficient, but it's "fast enough".
|
||||||
|
fn get_block(&mut self) -> Option<std::io::Result<AnyBlock>> {
|
||||||
|
let mut kv = HashMap::new();
|
||||||
|
|
||||||
|
while let Some(line) = self.iter.next() {
|
||||||
|
//dbg!(&line);
|
||||||
|
if let Err(e) = line {
|
||||||
|
return Some(Err(e));
|
||||||
|
}
|
||||||
|
let line_orig = line.unwrap();
|
||||||
|
let line = line_orig.splitn(2, '#').next().unwrap().trim();
|
||||||
|
if line.is_empty() {
|
||||||
|
if kv.is_empty() {
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let kwds: Vec<_> = line.splitn(2, ':').collect();
|
||||||
|
if kwds.len() != 2 {
|
||||||
|
return None; // XXXX handle the error better.
|
||||||
|
}
|
||||||
|
kv.insert(kwds[0].trim().to_string(), kwds[1].trim().to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
if kv.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let net = if let Some(net) = kv.get("net") {
|
||||||
|
net.parse().unwrap() //XXXX handle the error better.
|
||||||
|
} else {
|
||||||
|
return Some(Ok(AnyBlock::NotNet));
|
||||||
|
};
|
||||||
|
|
||||||
|
let cc = if let Some(country) = kv.get("country") {
|
||||||
|
assert!(country.as_bytes().len() == 2);
|
||||||
|
country.as_bytes()[0..2].try_into().unwrap()
|
||||||
|
} else {
|
||||||
|
return Some(Ok(AnyBlock::NotNet));
|
||||||
|
};
|
||||||
|
|
||||||
|
fn is_true(v: Option<&String>) -> bool {
|
||||||
|
match v {
|
||||||
|
Some(s) => s == "true",
|
||||||
|
None => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let is_anon_proxy = is_true(kv.get("is-anonymous-proxy"));
|
||||||
|
let is_anycast = is_true(kv.get("is-anycast-proxy"));
|
||||||
|
let is_satellite = is_true(kv.get("is-satellite-provider"));
|
||||||
|
|
||||||
|
Some(Ok(AnyBlock::NetBlock(NetBlock {
|
||||||
|
net,
|
||||||
|
cc,
|
||||||
|
is_anon_proxy,
|
||||||
|
is_anycast,
|
||||||
|
is_satellite,
|
||||||
|
})))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<I> Iterator for BlockReader<I>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = std::io::Result<String>>,
|
||||||
|
{
|
||||||
|
type Item = NetBlock;
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
loop {
|
||||||
|
match self.get_block() {
|
||||||
|
None => return None,
|
||||||
|
Some(Err(_)) => return None,
|
||||||
|
Some(Ok(AnyBlock::NotNet)) => continue,
|
||||||
|
Some(Ok(AnyBlock::NetBlock(n))) => return Some(n),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
165
scripts/maint/geoip/geoip-db-tool/src/main.rs
Normal file
165
scripts/maint/geoip/geoip-db-tool/src/main.rs
Normal file
@ -0,0 +1,165 @@
|
|||||||
|
//! A basic tool to convert IPFire Location dumps into the CSV formats that Tor
//! expects.
|
||||||
|
mod db;
|
||||||
|
|
||||||
|
use argh::FromArgs;
|
||||||
|
use ipnetwork::IpNetwork;
|
||||||
|
use rangemap::RangeInclusiveMap;
|
||||||
|
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{BufRead, BufReader, BufWriter, Write};
|
||||||
|
use std::net::{IpAddr, Ipv6Addr};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
/// Path used for the IPv4 output file when none is given on the command line.
fn default_ipv4_path() -> PathBuf {
    PathBuf::from("./geoip")
}
|
||||||
|
/// Path used for the IPv6 output file when none is given on the command line.
fn default_ipv6_path() -> PathBuf {
    PathBuf::from("./geoip6")
}
|
||||||
|
|
||||||
|
#[derive(FromArgs)]
|
||||||
|
/// Convert an IPFire Location dump into CSV geoip files.
|
||||||
|
struct Args {
|
||||||
|
/// where to store the IPv4 geoip output
|
||||||
|
#[argh(option, default = "default_ipv4_path()", short = '4')]
|
||||||
|
output_ipv4: PathBuf,
|
||||||
|
|
||||||
|
/// where to store the IPv6 geoip6 output
|
||||||
|
#[argh(option, default = "default_ipv6_path()", short = '6')]
|
||||||
|
output_ipv6: PathBuf,
|
||||||
|
|
||||||
|
/// where to find the dump file
|
||||||
|
#[argh(option, short = 'i')]
|
||||||
|
input: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a network block from running `location dump`.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct NetBlock {
|
||||||
|
pub net: IpNetwork,
|
||||||
|
pub cc: [u8; 2],
|
||||||
|
pub is_anon_proxy: bool,
|
||||||
|
pub is_anycast: bool,
|
||||||
|
pub is_satellite: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq for NetBlock {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.net == other.net
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// We define network blocks as being sorted first from largest to smallest,
|
||||||
|
/// then by address.
|
||||||
|
impl Ord for NetBlock {
|
||||||
|
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||||
|
self.net
|
||||||
|
.prefix()
|
||||||
|
.cmp(&other.net.prefix())
|
||||||
|
.then_with(|| self.net.network().cmp(&other.net.network()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialOrd for NetBlock {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Marker impl; `Ord` requires `Eq`.
impl Eq for NetBlock {}
|
||||||
|
|
||||||
|
/// Comment block written at the top of each output file, ahead of the header
/// copied from the dump.
const PROLOGUE: &str = concat!(
    "# This file has been converted from the IPFire Location database\n",
    "# using Tor's geoip-db-tool. For more information on the data, see\n",
    "# https://location.ipfire.org/.\n",
    "#\n",
    "# Below is the header from the original export:\n",
    "#\n",
);
|
||||||
|
|
||||||
|
/// Read an input file in the `location dump` format, and write CSV ipv4 and ipv6 files.
|
||||||
|
///
|
||||||
|
/// This code tries to be "efficient enough"; most of the logic is handled by
|
||||||
|
/// using the rangemap crate.
|
||||||
|
fn convert(input: &Path, output_v4: &Path, output_v6: &Path) -> std::io::Result<()> {
|
||||||
|
let f = File::open(input)?;
|
||||||
|
let f = BufReader::new(f);
|
||||||
|
let mut blocks = Vec::new();
|
||||||
|
|
||||||
|
let mut reader = db::BlockReader::new(f.lines());
|
||||||
|
let hdr = reader.extract_header();
|
||||||
|
// Read blocks, and then sort them by specificity and address.
|
||||||
|
for nb in reader {
|
||||||
|
blocks.push(nb);
|
||||||
|
}
|
||||||
|
blocks.sort();
|
||||||
|
|
||||||
|
// Convert the sorted blocks into a map from address ranges into
|
||||||
|
// country codes.
|
||||||
|
//
|
||||||
|
// Note that since we have sorted the blocks from least to most specific,
|
||||||
|
// we will be puttting them into the maps in the right order, so that the
|
||||||
|
// most specific rule "wins".
|
||||||
|
//
|
||||||
|
// We use u32 and u128 as the index types for these RangeInclusiveMaps,
|
||||||
|
// so that we don't need to implement a step function for IpAddr.
|
||||||
|
let mut v4map: RangeInclusiveMap<u32, [u8; 2], _> = RangeInclusiveMap::new();
|
||||||
|
let mut v6map: RangeInclusiveMap<u128, [u8; 2], _> = RangeInclusiveMap::new();
|
||||||
|
|
||||||
|
let mut n = 0usize;
|
||||||
|
let num_blocks = blocks.len();
|
||||||
|
for nb in blocks {
|
||||||
|
n += 1;
|
||||||
|
if n % 100000 == 0 {
|
||||||
|
println!("{}/{}", n, num_blocks);
|
||||||
|
}
|
||||||
|
let start = nb.net.network();
|
||||||
|
let end = nb.net.broadcast();
|
||||||
|
match (start, end) {
|
||||||
|
(IpAddr::V4(a), IpAddr::V4(b)) => {
|
||||||
|
v4map.insert(a.into()..=b.into(), nb.cc);
|
||||||
|
}
|
||||||
|
(IpAddr::V6(a), IpAddr::V6(b)) => {
|
||||||
|
v6map.insert(a.into()..=b.into(), nb.cc);
|
||||||
|
}
|
||||||
|
(_, _) => panic!("network started and ended in different families!?"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the ranges out to the appropriate files, in order.
|
||||||
|
let mut v4 = BufWriter::new(File::create(output_v4)?);
|
||||||
|
let mut v6 = BufWriter::new(File::create(output_v6)?);
|
||||||
|
|
||||||
|
v4.write_all(PROLOGUE.as_bytes())?;
|
||||||
|
v4.write_all(hdr.as_bytes())?;
|
||||||
|
for (r, cc) in v4map.iter() {
|
||||||
|
let a: u32 = *r.start();
|
||||||
|
let b: u32 = *r.end();
|
||||||
|
writeln!(&mut v4, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
v6.write_all(PROLOGUE.as_bytes())?;
|
||||||
|
v6.write_all(hdr.as_bytes())?;
|
||||||
|
for (r, cc) in v6map.iter() {
|
||||||
|
let a: Ipv6Addr = (*r.start()).into();
|
||||||
|
let b: Ipv6Addr = (*r.end()).into();
|
||||||
|
writeln!(&mut v6, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The documentation says you should always flush a BufWriter.
|
||||||
|
v4.flush()?;
|
||||||
|
v6.flush()?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() -> std::io::Result<()> {
|
||||||
|
let args: Args = argh::from_env();
|
||||||
|
|
||||||
|
convert(
|
||||||
|
args.input.as_path(),
|
||||||
|
args.output_ipv4.as_path(),
|
||||||
|
args.output_ipv6.as_path(),
|
||||||
|
)
|
||||||
|
}
|
16
scripts/maint/geoip/update_geoip.sh
Executable file
16
scripts/maint/geoip/update_geoip.sh
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/sh
#
# Fetch the latest IPFire Location database, dump it, and convert the dump
# with geoip-db-tool. The tool is run without output options, so it writes
# to its default output paths, relative to the directory this script is
# invoked from.

set -e

DIR=$(cd "$(dirname "$0")" && pwd)
TMP=$(mktemp -d)
# Remove the scratch directory when the script exits, whether or not the
# steps below succeed.
trap 'rm -rf "$TMP"' EXIT

location update
location dump "$TMP/geoip-dump.txt"

# Build in a subshell so that the working directory of this script is left
# untouched.
(cd "$DIR/geoip-db-tool/" && cargo build --release)

"$DIR/geoip-db-tool/target/release/geoip-db-tool" -i "$TMP/geoip-dump.txt"
|
Loading…
Reference in New Issue
Block a user