mirror of
https://gitlab.torproject.org/tpo/core/tor.git
synced 2024-11-10 21:23:58 +01:00
Rust tool to convert IPFire Location dump into CSV format.
The IPFire people provide a tool that collects data from several top-level sources, combines it into a single database, and annotates it with optional overrides. This tool transforms the "dump" format of their database into the form Tor expects.
This commit is contained in:
parent
8ccfd4a51a
commit
0d4237839b
25
scripts/maint/geoip/README.geoip
Normal file
25
scripts/maint/geoip/README.geoip
Normal file
@ -0,0 +1,25 @@
|
||||
To generate new geoip files, you'll need to install the
|
||||
libloc/"location" tool provided by https://location.ipfire.org/.
|
||||
I personally build it with:
|
||||
|
||||
./configure CFLAGS='-g -O2' --disable-perl --without-systemd --prefix=/opt/libloc
|
||||
make
|
||||
make install
|
||||
|
||||
Then (after adjusting PATH and PYTHONPATH) you can get the latest
|
||||
dump with:
|
||||
|
||||
location update
|
||||
location dump geoip-dump.txt
|
||||
|
||||
And transform it into geoip files with:
|
||||
|
||||
cargo run --release -- -i geoip-dump.txt
|
||||
|
||||
|
||||
==============================
|
||||
|
||||
Note that the current version "0.1.9" of rangemap has a performance
|
||||
bug, making this tool quite slow. Previous versions had a
|
||||
correctness bug that made the output needlessly long. With luck,
|
||||
there will soon be a fast correct rangemap version.
|
1
scripts/maint/geoip/geoip-db-tool/.gitignore
vendored
Normal file
1
scripts/maint/geoip/geoip-db-tool/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
target
|
16
scripts/maint/geoip/geoip-db-tool/Cargo.toml
Normal file
16
scripts/maint/geoip/geoip-db-tool/Cargo.toml
Normal file
@ -0,0 +1,16 @@
|
||||
[package]
|
||||
name = "geoip-db-tool"
|
||||
version = "0.1.0"
|
||||
authors = ["Nick Mathewson <nickm@torproject.org>"]
|
||||
edition = "2018"
|
||||
license = "MIT OR Apache-2.0"
|
||||
publish = false
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
ipnetwork= "0.17.0"
|
||||
rangemap= "0.1.9"
|
||||
# For now I use a local checkout instead, to avoid the performance bug in rangemap 0.1.9:
|
||||
# rangemap = {version = "*", path = "/home/nickm/src/rangemap/" }
|
||||
argh = "0.1.4"
|
126
scripts/maint/geoip/geoip-db-tool/src/db.rs
Normal file
126
scripts/maint/geoip/geoip-db-tool/src/db.rs
Normal file
@ -0,0 +1,126 @@
|
||||
//! Code to parse a dump file.
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::iter::Peekable;
|
||||
|
||||
use super::NetBlock;
|
||||
|
||||
/// Reads a `location dump` file line by line and groups the lines into blocks.
///
/// `I` is the underlying line source; every item is either one line of text
/// or the I/O error hit while reading it.
pub struct BlockReader<I: Iterator<Item = std::io::Result<String>>> {
    /// The line source, made peekable so that the header can be read without
    /// consuming the first line that follows it.
    iter: Peekable<I>,
}
|
||||
|
||||
enum AnyBlock {
|
||||
NotNet,
|
||||
NetBlock(NetBlock),
|
||||
}
|
||||
|
||||
impl<I> BlockReader<I>
|
||||
where
|
||||
I: Iterator<Item = std::io::Result<String>>,
|
||||
{
|
||||
pub fn new(iter: I) -> Self {
|
||||
BlockReader {
|
||||
iter: iter.peekable(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the initial header from the file.
|
||||
pub fn extract_header(&mut self) -> String {
|
||||
let mut res: String = "".to_string();
|
||||
|
||||
while let Some(Ok(line)) = self.iter.peek() {
|
||||
if !line.starts_with('#') {
|
||||
break;
|
||||
}
|
||||
res.push_str(line.as_str());
|
||||
res.push('\n');
|
||||
let _ = self.iter.next();
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Extract the next empty-line-delimited block from the file.
|
||||
///
|
||||
/// This isn't terribly efficient, but it's "fast enough".
|
||||
fn get_block(&mut self) -> Option<std::io::Result<AnyBlock>> {
|
||||
let mut kv = HashMap::new();
|
||||
|
||||
while let Some(line) = self.iter.next() {
|
||||
//dbg!(&line);
|
||||
if let Err(e) = line {
|
||||
return Some(Err(e));
|
||||
}
|
||||
let line_orig = line.unwrap();
|
||||
let line = line_orig.splitn(2, '#').next().unwrap().trim();
|
||||
if line.is_empty() {
|
||||
if kv.is_empty() {
|
||||
continue;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let kwds: Vec<_> = line.splitn(2, ':').collect();
|
||||
if kwds.len() != 2 {
|
||||
return None; // XXXX handle the error better.
|
||||
}
|
||||
kv.insert(kwds[0].trim().to_string(), kwds[1].trim().to_string());
|
||||
}
|
||||
|
||||
if kv.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let net = if let Some(net) = kv.get("net") {
|
||||
net.parse().unwrap() //XXXX handle the error better.
|
||||
} else {
|
||||
return Some(Ok(AnyBlock::NotNet));
|
||||
};
|
||||
|
||||
let cc = if let Some(country) = kv.get("country") {
|
||||
assert!(country.as_bytes().len() == 2);
|
||||
country.as_bytes()[0..2].try_into().unwrap()
|
||||
} else {
|
||||
return Some(Ok(AnyBlock::NotNet));
|
||||
};
|
||||
|
||||
fn is_true(v: Option<&String>) -> bool {
|
||||
match v {
|
||||
Some(s) => s == "true",
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
let is_anon_proxy = is_true(kv.get("is-anonymous-proxy"));
|
||||
let is_anycast = is_true(kv.get("is-anycast-proxy"));
|
||||
let is_satellite = is_true(kv.get("is-satellite-provider"));
|
||||
|
||||
Some(Ok(AnyBlock::NetBlock(NetBlock {
|
||||
net,
|
||||
cc,
|
||||
is_anon_proxy,
|
||||
is_anycast,
|
||||
is_satellite,
|
||||
})))
|
||||
}
|
||||
}
|
||||
|
||||
impl<I> Iterator for BlockReader<I>
|
||||
where
|
||||
I: Iterator<Item = std::io::Result<String>>,
|
||||
{
|
||||
type Item = NetBlock;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
match self.get_block() {
|
||||
None => return None,
|
||||
Some(Err(_)) => return None,
|
||||
Some(Ok(AnyBlock::NotNet)) => continue,
|
||||
Some(Ok(AnyBlock::NetBlock(n))) => return Some(n),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
165
scripts/maint/geoip/geoip-db-tool/src/main.rs
Normal file
165
scripts/maint/geoip/geoip-db-tool/src/main.rs
Normal file
@ -0,0 +1,165 @@
|
||||
//! A basic tool to convert IPFire Location dumps into the CSV formats that Tor
//! expects.
|
||||
mod db;
|
||||
|
||||
use argh::FromArgs;
|
||||
use ipnetwork::IpNetwork;
|
||||
use rangemap::RangeInclusiveMap;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, BufWriter, Write};
|
||||
use std::net::{IpAddr, Ipv6Addr};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Default destination of the IPv4 output: `./geoip`.
fn default_ipv4_path() -> PathBuf {
    PathBuf::from("./geoip")
}
|
||||
/// Default destination of the IPv6 output: `./geoip6`.
fn default_ipv6_path() -> PathBuf {
    PathBuf::from("./geoip6")
}
|
||||
|
||||
#[derive(FromArgs)]
|
||||
/// Convert an IPFire Location dump into CSV geoip files.
|
||||
struct Args {
|
||||
/// where to store the IPv4 geoip output
|
||||
#[argh(option, default = "default_ipv4_path()", short = '4')]
|
||||
output_ipv4: PathBuf,
|
||||
|
||||
/// where to store the IPv6 geoip6 output
|
||||
#[argh(option, default = "default_ipv6_path()", short = '6')]
|
||||
output_ipv6: PathBuf,
|
||||
|
||||
/// where to find the dump file
|
||||
#[argh(option, short = 'i')]
|
||||
input: PathBuf,
|
||||
}
|
||||
|
||||
/// Represents a network block from running `location dump`.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NetBlock {
|
||||
pub net: IpNetwork,
|
||||
pub cc: [u8; 2],
|
||||
pub is_anon_proxy: bool,
|
||||
pub is_anycast: bool,
|
||||
pub is_satellite: bool,
|
||||
}
|
||||
|
||||
impl PartialEq for NetBlock {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.net == other.net
|
||||
}
|
||||
}
|
||||
|
||||
/// We define network blocks as being sorted first from largest to smallest,
|
||||
/// then by address.
|
||||
impl Ord for NetBlock {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.net
|
||||
.prefix()
|
||||
.cmp(&other.net.prefix())
|
||||
.then_with(|| self.net.network().cmp(&other.net.network()))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for NetBlock {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl, needed because `Ord` has `Eq` as a supertrait.
impl Eq for NetBlock {}
|
||||
|
||||
/// Comment lines written at the top of each output file, immediately before
/// the header copied from the original dump.
const PROLOGUE: &str = concat!(
    "# This file has been converted from the IPFire Location database\n",
    "# using Tor's geoip-db-tool. For more information on the data, see\n",
    "# https://location.ipfire.org/.\n",
    "#\n",
    "# Below is the header from the original export:\n",
    "#\n",
);
|
||||
|
||||
/// Read an input file in the `location dump` format, and write CSV ipv4 and ipv6 files.
|
||||
///
|
||||
/// This code tries to be "efficient enough"; most of the logic is handled by
|
||||
/// using the rangemap crate.
|
||||
fn convert(input: &Path, output_v4: &Path, output_v6: &Path) -> std::io::Result<()> {
|
||||
let f = File::open(input)?;
|
||||
let f = BufReader::new(f);
|
||||
let mut blocks = Vec::new();
|
||||
|
||||
let mut reader = db::BlockReader::new(f.lines());
|
||||
let hdr = reader.extract_header();
|
||||
// Read blocks, and then sort them by specificity and address.
|
||||
for nb in reader {
|
||||
blocks.push(nb);
|
||||
}
|
||||
blocks.sort();
|
||||
|
||||
// Convert the sorted blocks into a map from address ranges into
|
||||
// country codes.
|
||||
//
|
||||
// Note that since we have sorted the blocks from least to most specific,
|
||||
// we will be puttting them into the maps in the right order, so that the
|
||||
// most specific rule "wins".
|
||||
//
|
||||
// We use u32 and u128 as the index types for these RangeInclusiveMaps,
|
||||
// so that we don't need to implement a step function for IpAddr.
|
||||
let mut v4map: RangeInclusiveMap<u32, [u8; 2], _> = RangeInclusiveMap::new();
|
||||
let mut v6map: RangeInclusiveMap<u128, [u8; 2], _> = RangeInclusiveMap::new();
|
||||
|
||||
let mut n = 0usize;
|
||||
let num_blocks = blocks.len();
|
||||
for nb in blocks {
|
||||
n += 1;
|
||||
if n % 100000 == 0 {
|
||||
println!("{}/{}", n, num_blocks);
|
||||
}
|
||||
let start = nb.net.network();
|
||||
let end = nb.net.broadcast();
|
||||
match (start, end) {
|
||||
(IpAddr::V4(a), IpAddr::V4(b)) => {
|
||||
v4map.insert(a.into()..=b.into(), nb.cc);
|
||||
}
|
||||
(IpAddr::V6(a), IpAddr::V6(b)) => {
|
||||
v6map.insert(a.into()..=b.into(), nb.cc);
|
||||
}
|
||||
(_, _) => panic!("network started and ended in different families!?"),
|
||||
}
|
||||
}
|
||||
|
||||
// Write the ranges out to the appropriate files, in order.
|
||||
let mut v4 = BufWriter::new(File::create(output_v4)?);
|
||||
let mut v6 = BufWriter::new(File::create(output_v6)?);
|
||||
|
||||
v4.write_all(PROLOGUE.as_bytes())?;
|
||||
v4.write_all(hdr.as_bytes())?;
|
||||
for (r, cc) in v4map.iter() {
|
||||
let a: u32 = *r.start();
|
||||
let b: u32 = *r.end();
|
||||
writeln!(&mut v4, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
|
||||
}
|
||||
|
||||
v6.write_all(PROLOGUE.as_bytes())?;
|
||||
v6.write_all(hdr.as_bytes())?;
|
||||
for (r, cc) in v6map.iter() {
|
||||
let a: Ipv6Addr = (*r.start()).into();
|
||||
let b: Ipv6Addr = (*r.end()).into();
|
||||
writeln!(&mut v6, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
|
||||
}
|
||||
|
||||
// The documentation says you should always flush a BufWriter.
|
||||
v4.flush()?;
|
||||
v6.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> std::io::Result<()> {
|
||||
let args: Args = argh::from_env();
|
||||
|
||||
convert(
|
||||
args.input.as_path(),
|
||||
args.output_ipv4.as_path(),
|
||||
args.output_ipv6.as_path(),
|
||||
)
|
||||
}
|
16
scripts/maint/geoip/update_geoip.sh
Executable file
16
scripts/maint/geoip/update_geoip.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/sh
# Fetch the latest IPFire Location database, dump it, and convert the dump
# with geoip-db-tool.  No output options are passed to the tool, so it writes
# to its default locations relative to the directory this script is run from.

set -e

# Absolute path of the directory containing this script.
DIR=$(cd "$(dirname "$0")" && pwd)

# Scratch directory for the dump; removed again when the script exits.
TMP=$(mktemp -d)
trap 'rm -rf "$TMP"' EXIT

location update
location dump "$TMP/geoip-dump.txt"

# Build in a subshell so that our own working directory never changes.
(cd "$DIR/geoip-db-tool/" && cargo build --release)

"$DIR/geoip-db-tool/target/release/geoip-db-tool" -i "$TMP/geoip-dump.txt"
|
Loading…
Reference in New Issue
Block a user