Merge pull request #65 from MarginaliaSearch/asn-info
Replace the ip2location-LITE IP geolocation data with ASN information from apnic.net
This commit is contained in:
commit
7797de80e3
@ -22,6 +22,10 @@ public class DomainInformation {
|
||||
boolean unknownDomain;
|
||||
|
||||
String ip;
|
||||
Integer asn;
|
||||
String asnOrg;
|
||||
String asnCountry;
|
||||
|
||||
String ipCountry;
|
||||
String state;
|
||||
|
||||
|
@ -62,6 +62,15 @@ public class WmsaHome {
|
||||
|
||||
public static Path getIPLocationDatabse() {
|
||||
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
|
||||
|
||||
}
|
||||
|
||||
public static Path getAsnMappingDatabase() {
|
||||
return getHomePath().resolve("data").resolve("asn-data-raw-table");
|
||||
}
|
||||
|
||||
public static Path getAsnInfoDatabase() {
|
||||
return getHomePath().resolve("data").resolve("asn-used-autnums");
|
||||
}
|
||||
|
||||
public static LanguageModels getLanguageModels() {
|
||||
@ -85,4 +94,6 @@ public class WmsaHome {
|
||||
public static boolean isDebug() {
|
||||
return debugMode;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -1,55 +1,45 @@
|
||||
package nu.marginalia.geoip;
|
||||
|
||||
import com.opencsv.CSVReader;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.geoip.sources.AsnMapping;
|
||||
import nu.marginalia.geoip.sources.AsnTable;
|
||||
import nu.marginalia.geoip.sources.IP2LocationMapping;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.net.InetAddress;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Optional;
|
||||
|
||||
public class GeoIpDictionary {
|
||||
private volatile TreeMap<Long, IpRange> ranges = null;
|
||||
private volatile IP2LocationMapping ip2locMapping = null;
|
||||
private volatile AsnTable asnTable = null;
|
||||
private volatile AsnMapping asnMapping = null;
|
||||
private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
|
||||
|
||||
record IpRange(long from, long to, String country) {}
|
||||
volatile boolean ready = false;
|
||||
|
||||
public GeoIpDictionary() {
|
||||
Thread.ofPlatform().start(() -> {
|
||||
try (var reader = new CSVReader(new FileReader(WmsaHome.getIPLocationDatabse().toFile()))) {
|
||||
var dict = new TreeMap<Long, IpRange>();
|
||||
this.asnTable = new AsnTable(WmsaHome.getAsnInfoDatabase());
|
||||
logger.info("Loaded ASN table");
|
||||
this.asnMapping = new AsnMapping(WmsaHome.getAsnMappingDatabase());
|
||||
logger.info("Loaded ASN mapping");
|
||||
this.ip2locMapping = new IP2LocationMapping(WmsaHome.getIPLocationDatabse());
|
||||
|
||||
for (;;) {
|
||||
String[] vals = reader.readNext();
|
||||
if (vals == null) {
|
||||
break;
|
||||
}
|
||||
var range = new IpRange(Long.parseLong(vals[0]),
|
||||
Long.parseLong(vals[1]),
|
||||
vals[2]);
|
||||
dict.put(range.from, range);
|
||||
}
|
||||
ranges = dict;
|
||||
logger.info("Loaded {} IP ranges", ranges.size());
|
||||
} catch (Exception e) {
|
||||
ranges = new TreeMap<>();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
finally {
|
||||
synchronized (this) {
|
||||
this.notifyAll();
|
||||
}
|
||||
ready = true;
|
||||
|
||||
synchronized (this) {
|
||||
this.notifyAll();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public boolean isReady() {
|
||||
return null != ranges;
|
||||
return ready;
|
||||
}
|
||||
|
||||
public boolean waitReady() {
|
||||
while (null == ranges) {
|
||||
while (!ready) {
|
||||
try {
|
||||
synchronized (this) {
|
||||
this.wait(1000);
|
||||
@ -61,32 +51,46 @@ public class GeoIpDictionary {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public String getCountry(String ip) {
|
||||
try {
|
||||
return getCountry(InetAddress.getByName(ip));
|
||||
} catch (Exception e) {
|
||||
if (null == ip2locMapping) {
|
||||
return "";
|
||||
}
|
||||
return ip2locMapping.getCountry(ip);
|
||||
}
|
||||
|
||||
public String getCountry(InetAddress address) {
|
||||
if (null == ranges) { // not loaded yet or failed to load
|
||||
if (null == ip2locMapping) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return ip2locMapping.getCountry(address);
|
||||
}
|
||||
|
||||
public Optional<AsnTable.AsnInfo> getAsnInfo(String ip) {
|
||||
try {
|
||||
return getAsnInfo(InetAddress.getByName(ip));
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<AsnTable.AsnInfo> getAsnInfo(int ipAddress) {
|
||||
if (null == asnTable) { // not loaded yet or failed to load
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return asnMapping
|
||||
.getAsnNumber(ipAddress)
|
||||
.flatMap(asn -> asnTable.getAsnInfo(asn));
|
||||
}
|
||||
|
||||
public Optional<AsnTable.AsnInfo> getAsnInfo(InetAddress address) {
|
||||
byte[] bytes = address.getAddress();
|
||||
long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
|
||||
|
||||
Long key = ranges.floorKey(ival);
|
||||
if (null == key) {
|
||||
return "";
|
||||
}
|
||||
int ival = (int) (((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF));
|
||||
|
||||
var range = ranges.get(key);
|
||||
if (ival >= key && ival < range.to) {
|
||||
return range.country;
|
||||
}
|
||||
|
||||
return "";
|
||||
return getAsnInfo(ival);
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,64 @@
|
||||
package nu.marginalia.geoip.sources;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Optional;
|
||||
|
||||
public class AsnMapping {
|
||||
private static final Logger logger = LoggerFactory.getLogger(AsnMapping.class);
|
||||
private final IpRangeMapping<Integer> ranges = new IpRangeMapping<>();
|
||||
|
||||
public AsnMapping(Path databaseFile) {
|
||||
try (var reader = Files.lines(databaseFile)) {
|
||||
reader.forEach(this::parseAsnMappingFileLine);
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to load ASN mapping" + databaseFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<Integer> getAsnNumber(int ip) {
|
||||
return ranges.get(ip);
|
||||
}
|
||||
|
||||
private void parseAsnMappingFileLine(String s) {
|
||||
try {
|
||||
String[] parts = StringUtils.split(s, '\t');
|
||||
if (parts.length != 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Parse CIDR notation, e.g. 127.0.0.1/24 -> ["127.0.0.1", "24"]
|
||||
String[] cidrParts = StringUtils.split(parts[0], '/');
|
||||
if (cidrParts.length != 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Parse IP address and subnet mask
|
||||
String[] ipParts = StringUtils.split(cidrParts[0], '.');
|
||||
int ipMask = Integer.parseInt(cidrParts[1]);
|
||||
|
||||
// Convert subnet mask to integer start and end values
|
||||
int ipStart = 0;
|
||||
int ipEnd = 0;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int ipByte = Integer.parseInt(ipParts[i]);
|
||||
ipStart |= ipByte << (24 - 8 * i);
|
||||
ipEnd |= ipByte << (24 - 8 * i);
|
||||
}
|
||||
ipStart &= 0xFFFFFFFF << (32 - ipMask);
|
||||
ipEnd |= 0xFFFFFFFF >>> ipMask;
|
||||
|
||||
|
||||
ranges.add(ipStart, ipEnd, Integer.parseInt(parts[1]));
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to parse ASN mapping line: {}", s);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,62 @@
|
||||
package nu.marginalia.geoip.sources;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
public class AsnTable {
|
||||
public HashMap<Integer, AsnInfo> asns = new HashMap<>(65536);
|
||||
public record AsnInfo(int asn, String country, String org) {}
|
||||
private static final Logger logger = LoggerFactory.getLogger(AsnTable.class);
|
||||
|
||||
public AsnTable(Path asnFile) {
|
||||
try (var reader = Files.lines(WmsaHome.getAsnInfoDatabase())) {
|
||||
reader.map(AsnTable::parseAsnFileLine).filter(Objects::nonNull).forEach(asn -> asns.put(asn.asn(), asn));
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to load ASN database " + asnFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<AsnInfo> getAsnInfo(int asn) {
|
||||
return Optional.ofNullable(asns.get(asn));
|
||||
}
|
||||
|
||||
static AsnInfo parseAsnFileLine(String line) {
|
||||
line = line.trim();
|
||||
|
||||
try {
|
||||
int numEnd = line.indexOf(' ');
|
||||
String num = line.substring(0, numEnd);
|
||||
|
||||
int asn = Integer.parseInt(num);
|
||||
|
||||
int orgStart = numEnd + 1;
|
||||
int orgEnd = line.lastIndexOf(',');
|
||||
if (orgEnd < 0 || orgEnd < orgStart + 1) {
|
||||
orgEnd = line.length();
|
||||
}
|
||||
|
||||
String org = line.substring(orgStart, orgEnd);
|
||||
String country = "";
|
||||
if (orgEnd + 1 < line.length()) {
|
||||
country = line.substring(orgEnd + 1).trim();
|
||||
}
|
||||
|
||||
if ("UNALLOCATED".equals(org)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new AsnInfo(asn, country, org);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to parse ASN line: {}", line);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
package nu.marginalia.geoip.sources;
|
||||
|
||||
import com.opencsv.CSVReader;
|
||||
|
||||
import java.net.InetAddress;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/** Load an IP2LOCATION LITE database file and provide a method to look up the country for an IP address.
|
||||
*/
|
||||
public class IP2LocationMapping {
|
||||
private final IpRangeMapping<String> ranges = new IpRangeMapping<>();
|
||||
|
||||
public IP2LocationMapping(Path filename) {
|
||||
try (var reader = new CSVReader(Files.newBufferedReader(filename))) {
|
||||
for (;;) {
|
||||
String[] vals = reader.readNext();
|
||||
if (vals == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
ranges.add(Integer.parseUnsignedInt(vals[0]), Integer.parseUnsignedInt(vals[1]), vals[2]);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public String getCountry(String ip) {
|
||||
try {
|
||||
return getCountry(InetAddress.getByName(ip));
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public String getCountry(InetAddress address) {
|
||||
return ranges.get(address).orElse("");
|
||||
}
|
||||
}
|
@ -0,0 +1,41 @@
|
||||
package nu.marginalia.geoip.sources;
|
||||
|
||||
import java.net.InetAddress;
|
||||
import java.util.Optional;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
 * Associates ranges of IPv4 addresses with values of type {@code T}.
 *
 * <p>Addresses are handled as 32 bit values stored in an {@code int} and compared as
 * unsigned numbers. Ranges are keyed by their start address; a lookup picks the range
 * with the greatest start address not above the queried address and then checks that
 * the address really falls inside it.
 *
 * @param <T> the type of value attached to each range
 */
public class IpRangeMapping<T> {
    private final TreeMap<Integer, IpRangeWithCountry<T>> ranges = new TreeMap<>(Integer::compareUnsigned);

    /**
     * A range of addresses and its value. Both {@code ipStart} and {@code ipEnd} are
     * part of the range.
     */
    public record IpRangeWithCountry<T>(int ipStart, int ipEnd, T value) {
        /** @return true if {@code ip} lies within {@code [ipStart, ipEnd]} (unsigned comparison) */
        public boolean contains(int ip) {
            // The end is inclusive: callers pass the last address of the block
            // (e.g. x.y.z.255 for a /24), which must still match.
            return Integer.compareUnsigned(ipStart, ip) <= 0
                && Integer.compareUnsigned(ip, ipEnd) <= 0;
        }
    }

    /**
     * Registers a range. A later range with the same start address replaces an earlier one.
     *
     * @param ipStart first address of the range
     * @param ipEnd last address of the range (inclusive)
     * @param value value to return for addresses in the range
     */
    public void add(int ipStart, int ipEnd, T value) {
        ranges.put(ipStart, new IpRangeWithCountry<>(ipStart, ipEnd, value));
    }

    /**
     * Looks up the value for an address.
     *
     * @param address the address to look up
     * @return the value of the range containing the address; empty if there is none or
     *         if the address is not a 4-byte (IPv4) address
     */
    public Optional<T> get(InetAddress address) {
        byte[] bytes = address.getAddress();
        if (bytes.length != 4) {
            // Only IPv4 ranges are stored; the leading bytes of a longer address
            // must not be mistaken for an IPv4 address.
            return Optional.empty();
        }

        int ival = (bytes[0] & 0xFF) << 24
                 | (bytes[1] & 0xFF) << 16
                 | (bytes[2] & 0xFF) << 8
                 | (bytes[3] & 0xFF);

        return get(ival);
    }

    /**
     * Looks up the value for an address given as an unsigned 32 bit value.
     *
     * @param ipUnsignedInt the address, most significant octet first
     * @return the value of the range containing the address, or empty if there is none
     */
    public Optional<T> get(int ipUnsignedInt) {
        var candidate = ranges.floorEntry(ipUnsignedInt);
        if (null == candidate) {
            return Optional.empty();
        }

        var range = candidate.getValue();
        if (range.contains(ipUnsignedInt)) {
            return Optional.ofNullable(range.value());
        }

        return Optional.empty();
    }
}
|
@ -0,0 +1,16 @@
|
||||
package nu.marginalia.geoip;
|
||||
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Tag("slow")
|
||||
class GeoIpDictionaryTest {
|
||||
|
||||
@Test
|
||||
public void testAsnResolution() {
|
||||
GeoIpDictionary geoIpDictionary = new GeoIpDictionary();
|
||||
geoIpDictionary.waitReady();
|
||||
System.out.println(geoIpDictionary.getAsnInfo("193.183.0.162"));
|
||||
}
|
||||
|
||||
}
|
@ -11,12 +11,14 @@ import nu.marginalia.converting.processor.logic.links.LinkGraph;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.crawling.model.*;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.geoip.sources.AsnTable;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
||||
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -121,12 +123,7 @@ public class DomainProcessor {
|
||||
|
||||
List<String> terms = new ArrayList<>();
|
||||
|
||||
terms.add("ip:"+ip);
|
||||
|
||||
String ipCountryCode = geoIpDictionary.getCountry(ip).toLowerCase();
|
||||
if (!ipCountryCode.isBlank()) {
|
||||
terms.add("ip:"+ipCountryCode);
|
||||
}
|
||||
addIpInfo(terms, ip);
|
||||
|
||||
if (cookies) {
|
||||
terms.add(HtmlFeature.COOKIES.getKeyword());
|
||||
@ -156,6 +153,59 @@ public class DomainProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void addIpInfo(List<String> terms, String ip) {
|
||||
terms.add("ip:"+ip);
|
||||
|
||||
// Add IP location country as a term
|
||||
String country = geoIpDictionary.getCountry(ip);
|
||||
if (!country.isBlank()) { // use the ip:-prefix as there's no real confusion between e.g. ip:127.0.0.1 and ip:uk
|
||||
terms.add("ip:"+country.toLowerCase());
|
||||
}
|
||||
|
||||
// Add ASN as a term
|
||||
geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
|
||||
terms.add("as:"+asnInfo.asn());
|
||||
|
||||
for (var orgPart : StringUtils.split(asnInfo.org(), '-')) {
|
||||
terms.add("as:"+orgPart.toLowerCase());
|
||||
}
|
||||
|
||||
if (isCloudy(asnInfo)) {
|
||||
terms.add("special:cloud");
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
}
|
||||
|
||||
private boolean isCloudy(AsnTable.AsnInfo asnInfo) {
|
||||
String org = asnInfo.org();
|
||||
|
||||
if (org.contains("MICROSOFT-AZURE")) {
|
||||
return true;
|
||||
}
|
||||
if(org.contains("AMAZON")) {
|
||||
return true;
|
||||
}
|
||||
if (org.contains("CLOUDFLARE")) {
|
||||
return true;
|
||||
}
|
||||
if (org.contains("GOOGLE-CLOUD")) {
|
||||
return true;
|
||||
}
|
||||
if (org.contains("DIGITALOCEAN")) {
|
||||
return true;
|
||||
}
|
||||
if (org.contains("ALIBABA")) {
|
||||
return true;
|
||||
}
|
||||
if (org.contains("CLOUDFLARE")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
||||
private boolean isAcademicDomain(EdgeDomain domain) {
|
||||
|
@ -6,6 +6,8 @@
|
||||
Pages Known: {{pagesKnown}} <br/>
|
||||
Pages Crawled: {{pagesFetched}} <br/>
|
||||
Pages Indexed: {{pagesIndexed}} <br/>
|
||||
<p></p>
|
||||
IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/>
|
||||
<span title="Autonomous System">AS</span>: {{#if asn}}AS{{asn}} {{asnOrg}} {{asnCountry}}{{/if}} <br/>
|
||||
</fieldset>
|
||||
<br/>
|
@ -60,6 +60,11 @@ public class DomainInformationService {
|
||||
String ip = rs.getString("IP");
|
||||
|
||||
builder.ip(ip);
|
||||
geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
|
||||
builder.asn(asnInfo.asn());
|
||||
builder.asnOrg(asnInfo.org());
|
||||
builder.asnCountry(asnInfo.country());
|
||||
});
|
||||
builder.ipCountry(geoIpDictionary.getCountry(ip));
|
||||
|
||||
builder.nodeAffinity(rs.getInt("NODE_AFFINITY"));
|
||||
|
@ -32,6 +32,9 @@ download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.f
|
||||
download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
|
||||
unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP
|
||||
|
||||
download_model data/asn-data-raw-table https://thyme.apnic.net/current/data-raw-table
|
||||
download_model data/asn-used-autnums https://thyme.apnic.net/current/data-used-autnums
|
||||
|
||||
download_model data/adblock.txt https://downloads.marginalia.nu/data/adblock.txt
|
||||
if [ ! -f data/suggestions.txt ]; then
|
||||
download_model data/suggestions.txt.gz https://downloads.marginalia.nu/data/suggestions.txt.gz
|
||||
|
Loading…
Reference in New Issue
Block a user