(geo-ip) Revert removal of ip2location logic

We now use both ip2location and ASN data.

The change also adds some keywords based on autonomous system information, on a somewhat experimental basis. It would be neat to be able to exclude, for example, cloud services in general, or just Cloudflare, from the search results.
This commit is contained in:
Viktor Lofgren 2023-12-17 15:03:00 +01:00
parent bde68ba48b
commit c92f1b8df8
14 changed files with 196 additions and 159 deletions

View File

@ -24,6 +24,8 @@ public class DomainInformation {
String ip;
Integer asn;
String asnOrg;
String asnCountry;
String ipCountry;
String state;

View File

@ -2,7 +2,6 @@ package nu.marginalia.ip_blocklist;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.geoip.AsnTable;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
@ -44,9 +43,7 @@ public class GeoIpBlocklist {
public String getCountry(EdgeDomain domain) {
try {
return ipDictionary.getAsnInfo(InetAddressCache.getAddress(domain))
.map(AsnTable.AsnInfo::country)
.orElse("-");
return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
}
catch (Throwable ex) {
logger.debug("Failed to resolve {}", domain);

View File

@ -1,6 +1,9 @@
package nu.marginalia.geoip;
import nu.marginalia.WmsaHome;
import nu.marginalia.geoip.sources.AsnMapping;
import nu.marginalia.geoip.sources.AsnTable;
import nu.marginalia.geoip.sources.IP2LocationMapping;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -8,10 +11,12 @@ import java.net.InetAddress;
import java.util.Optional;
public class GeoIpDictionary {
private volatile IP2LocationMapping ip2locMapping = null;
private volatile AsnTable asnTable = null;
private volatile AsnMapping asnMapping = null;
private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
volatile boolean ready = false;
public GeoIpDictionary() {
Thread.ofPlatform().start(() -> {
@ -19,15 +24,22 @@ public class GeoIpDictionary {
logger.info("Loaded ASN table");
this.asnMapping = new AsnMapping(WmsaHome.getAsnMappingDatabase());
logger.info("Loaded ASN mapping");
this.ip2locMapping = new IP2LocationMapping(WmsaHome.getIPLocationDatabse());
ready = true;
synchronized (this) {
this.notifyAll();
}
});
}
public boolean isReady() {
return null != asnMapping;
return ready;
}
public boolean waitReady() {
while (null == asnMapping) {
while (!ready) {
try {
synchronized (this) {
this.wait(1000);
@ -39,6 +51,22 @@ public class GeoIpDictionary {
return true;
}
/**
 * Looks up the country for an IP address given in textual form.
 *
 * @param ip the IP address as a string
 * @return whatever the IP2Location mapping reports for the address, or the empty
 *         string while that mapping has not been assigned yet
 */
public String getCountry(String ip) {
    // Take a single snapshot of the volatile field before deciding what to do
    final IP2LocationMapping mapping = ip2locMapping;
    return mapping == null ? "" : mapping.getCountry(ip);
}
/**
 * Looks up the country for an already resolved IP address.
 *
 * @param address the address to look up
 * @return whatever the IP2Location mapping reports for the address, or the empty
 *         string while that mapping has not been assigned yet
 */
public String getCountry(InetAddress address) {
    // Take a single snapshot of the volatile field before deciding what to do
    final IP2LocationMapping mapping = ip2locMapping;
    return mapping == null ? "" : mapping.getCountry(address);
}
public Optional<AsnTable.AsnInfo> getAsnInfo(String ip) {
try {
return getAsnInfo(InetAddress.getByName(ip));

View File

@ -1,4 +1,4 @@
package nu.marginalia.geoip;
package nu.marginalia.geoip.sources;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
@ -6,55 +6,35 @@ import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.Optional;
import java.util.TreeMap;
public class AsnMapping {
private static final Logger logger = LoggerFactory.getLogger(AsnMapping.class);
private final TreeMap<Integer, AsnMappingRecord> asns = new TreeMap<>(Integer::compareUnsigned);
public record AsnMappingRecord(int ipStart, int ipEnd, int asn) {
public boolean contains(int ip) {
return Integer.compareUnsigned(ipStart, ip) <= 0
&& Integer.compareUnsigned(ip, ipEnd) < 0;
}
}
private final IpRangeMapping<Integer> ranges = new IpRangeMapping<>();
public AsnMapping(Path databaseFile) {
try (var reader = Files.lines(databaseFile)) {
reader.map(AsnMapping::parseAsnMappingFileLine).filter(Objects::nonNull).forEach(asn -> asns.put(asn.ipStart(), asn));
reader.forEach(this::parseAsnMappingFileLine);
} catch (Exception e) {
logger.error("Failed to load ASN mapping" + databaseFile, e);
}
}
public Optional<Integer> getAsnNumber(int ip) {
var entry = asns.floorEntry(ip);
if (null == entry) {
return Optional.empty();
return ranges.get(ip);
}
var asn = entry.getValue();
if (asn.contains(ip)) {
return Optional.of(asn.asn());
}
return Optional.empty();
}
public static AsnMappingRecord parseAsnMappingFileLine(String s) {
private void parseAsnMappingFileLine(String s) {
try {
String[] parts = StringUtils.split(s, '\t');
if (parts.length != 2) {
return null;
return;
}
// Parse CIDR notation, e.g. 127.0.0.1/24 -> ["127.0.0.1", "24"]
String[] cidrParts = StringUtils.split(parts[0], '/');
if (cidrParts.length != 2) {
return null;
return;
}
// Parse IP address and subnet mask
@ -72,12 +52,12 @@ public class AsnMapping {
ipStart &= 0xFFFFFFFF << (32 - ipMask);
ipEnd |= 0xFFFFFFFF >>> ipMask;
return new AsnMappingRecord(ipStart, ipEnd, Integer.parseInt(parts[1]));
ranges.add(ipStart, ipEnd, Integer.parseInt(parts[1]));
}
catch (Exception ex) {
logger.warn("Failed to parse ASN mapping line: {}", s);
return null;
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.geoip;
package nu.marginalia.geoip.sources;
import nu.marginalia.WmsaHome;
import org.slf4j.Logger;

View File

@ -0,0 +1,40 @@
package nu.marginalia.geoip.sources;
import com.opencsv.CSVReader;
import java.net.InetAddress;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Country lookup backed by an IP2LOCATION LITE database file.
 * <p>
 * Every CSV row is read as (range start, range end, value): the first two columns are
 * parsed as unsigned 32-bit integers and the third column is stored as the value that is
 * returned for addresses in that range (presumably a country code; confirm against the
 * database format).
 */
public class IP2LocationMapping {
    private final IpRangeMapping<String> ranges = new IpRangeMapping<>();

    /**
     * Reads the whole CSV file into memory.
     *
     * @param filename path of the CSV database file
     * @throws RuntimeException wrapping any failure while opening, reading or parsing the file
     */
    public IP2LocationMapping(Path filename) {
        try (var csv = new CSVReader(Files.newBufferedReader(filename))) {
            String[] row;
            // readNext() returns null once the input is exhausted
            while ((row = csv.readNext()) != null) {
                int rangeStart = Integer.parseUnsignedInt(row[0]);
                int rangeEnd = Integer.parseUnsignedInt(row[1]);
                ranges.add(rangeStart, rangeEnd, row[2]);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Looks up the value for an address given as a string.
     *
     * @param ip textual address, resolved with {@link InetAddress#getByName(String)}
     * @return the stored value, or the empty string if the address cannot be resolved,
     *         the lookup fails, or no range matches
     */
    public String getCountry(String ip) {
        try {
            InetAddress resolved = InetAddress.getByName(ip);
            return getCountry(resolved);
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * Looks up the value for a resolved address.
     *
     * @param address the address to look up
     * @return the stored value, or the empty string if no range matches
     */
    public String getCountry(InetAddress address) {
        var match = ranges.get(address);
        return match.orElse("");
    }
}

View File

@ -0,0 +1,41 @@
package nu.marginalia.geoip.sources;
import java.net.InetAddress;
import java.util.Optional;
import java.util.TreeMap;
/**
 * Maps ranges of IPv4 addresses to values of type {@code T}.
 * <p>
 * Addresses are represented as 32-bit integers and compared as unsigned values. Ranges are
 * keyed on their start address, so adding a range with a start address that is already
 * present replaces the earlier range. Both ends of a range are inclusive.
 *
 * @param <T> the type of value associated with each range
 */
public class IpRangeMapping<T> {
    private final TreeMap<Integer, IpRangeWithCountry<T>> ranges = new TreeMap<>(Integer::compareUnsigned);

    /**
     * A single address range and its value.
     *
     * @param ipStart first address of the range (inclusive), as an unsigned 32-bit integer
     * @param ipEnd   last address of the range (inclusive), as an unsigned 32-bit integer
     * @param value   the value associated with the range
     */
    public record IpRangeWithCountry<T>(int ipStart, int ipEnd, T value) {
        /** Returns true if {@code ip} lies within {@code [ipStart, ipEnd]}, compared unsigned. */
        public boolean contains(int ip) {
            // The end is inclusive: callers pass the last address of the block as ipEnd,
            // so an exclusive comparison would miss that address and would never match
            // a single-address range.
            return Integer.compareUnsigned(ipStart, ip) <= 0
                    && Integer.compareUnsigned(ip, ipEnd) <= 0;
        }
    }

    /**
     * Registers a range.
     *
     * @param ipStart first address of the range (inclusive)
     * @param ipEnd   last address of the range (inclusive)
     * @param value   the value to return for addresses in the range
     */
    public void add(int ipStart, int ipEnd, T value) {
        ranges.put(ipStart, new IpRangeWithCountry<>(ipStart, ipEnd, value));
    }

    /**
     * Looks up the value for an address.
     *
     * @param address the address to look up
     * @return the value of the matching range, or empty if there is none or if the
     *         address is not a 4-byte (IPv4) address
     */
    public Optional<T> get(InetAddress address) {
        byte[] bytes = address.getAddress();

        // Only IPv4 addresses can be looked up here; interpreting the first four bytes
        // of a 16-byte IPv6 address as IPv4 would yield an unrelated result.
        if (bytes.length != 4) {
            return Optional.empty();
        }

        int ival = (bytes[0] & 0xFF) << 24
                | (bytes[1] & 0xFF) << 16
                | (bytes[2] & 0xFF) << 8
                | (bytes[3] & 0xFF);

        return get(ival);
    }

    /**
     * Looks up the value for an address given as an unsigned 32-bit integer.
     *
     * @param ipUnsignedInt the address
     * @return the value of the range with the greatest start address not above the given
     *         address, provided that range contains the address; otherwise empty
     */
    public Optional<T> get(int ipUnsignedInt) {
        var entry = ranges.floorEntry(ipUnsignedInt);
        if (entry == null) {
            return Optional.empty();
        }

        var range = entry.getValue();
        if (range.contains(ipUnsignedInt)) {
            return Optional.of(range.value());
        }

        return Optional.empty();
    }
}

View File

@ -1,55 +0,0 @@
package nu.marginalia.geoip;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.net.InetAddress;
import java.net.UnknownHostException;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertNotNull;
/** Tests for the line parser of {@link AsnMapping} and the range check of its records. */
public class AsnMappingTest {

    /**
     * A well-formed line yields a record carrying the ASN, while free-form text, null,
     * the empty string and a line lacking the ASN column all yield null.
     */
    @Test
    public void testParseAsnMappingFileLine() throws UnknownHostException {
        // Valid line: CIDR block, a tab, then the ASN
        AsnMapping.AsnMappingRecord parsed = AsnMapping.parseAsnMappingFileLine("192.0.2.0/24\t65536");
        assertNotNull(parsed, "The result should not be null for valid data");
        assertEquals(65536, parsed.asn(), "The asn is not as expected");

        // Text that is not in the expected format at all
        assertNull(AsnMapping.parseAsnMappingFileLine("nah I am just a string, not an ASN Mapping Line..."),
                "The result should be null for invalid data");

        // Null input
        assertNull(AsnMapping.parseAsnMappingFileLine(null),
                "The result should be null for null input");

        // Empty string
        assertNull(AsnMapping.parseAsnMappingFileLine(""),
                "The result should be null for empty string");

        // Only the CIDR part, no ASN column
        assertNull(AsnMapping.parseAsnMappingFileLine("192.0.2.0/24"),
                "The result should be null for a string with only one part");
    }

    /** The first address of a parsed CIDR block is contained in the resulting record. */
    @Test
    public void testIpBounds() throws UnknownHostException {
        AsnMapping.AsnMappingRecord parsed = AsnMapping.parseAsnMappingFileLine("193.183.0.0/24\t207825");
        assertNotNull(parsed, "The result should not be null for valid data");

        byte[] octets = InetAddress.getAllByName("193.183.0.0")[0].getAddress();

        // Pack the four address bytes, most significant first, into one int
        int packed = 0;
        for (byte octet : octets) {
            packed = (packed << 8) | (octet & 0xFF);
        }

        assertTrue(parsed.contains(packed));
    }
}

View File

@ -1,55 +0,0 @@
package nu.marginalia.geoip;
import nu.marginalia.geoip.AsnTable;
import nu.marginalia.geoip.AsnTable.AsnInfo;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
/**
 * Tests for the static method {@code parseAsnFileLine} of the {@code AsnTable} class,
 * which parses a line from an ASN table file into an {@code AsnInfo} instance holding
 * the ASN number, country and organization string.
 */
public class AsnTableTest {

    /** A line marked as unallocated produces no AsnInfo. */
    @Test
    public void testParseAsnFileLine_ShouldReturnNullWhenUnallocated() {
        assertNull(AsnTable.parseAsnFileLine(" 1 UNALLOCATED"),
                "Parse ASN File Line output should be null for unallocated ASN");
    }

    /** A line that cannot be parsed produces no AsnInfo. */
    @Test
    public void testParseAsnFileLine_ShouldReturnNullWhenInputIsNotParsable() {
        assertNull(AsnTable.parseAsnFileLine(" NotParsable Line"),
                "Parse ASN File Line output should be null for unparsable lines");
    }

    /** Number, organization and country are all extracted from a complete line. */
    @Test
    public void testParseAsnFileLine_AllFieldsParsedCorrectly() {
        AsnInfo parsed = AsnTable.parseAsnFileLine("123456 Company,US ");

        assertEquals(new AsnInfo(123456, "US", "Company"), parsed,
                "Parse ASN File Line output should match expected AsnInfo when line is correctly formatted");
    }

    /** Commas inside the organization name are kept; only the last field is the country. */
    @Test
    public void testParseAsnFileLine_MultipleCommasInOrg() {
        AsnInfo parsed = AsnTable.parseAsnFileLine("123456 Company, Inc., US ");

        assertEquals(new AsnInfo(123456, "US", "Company, Inc."), parsed,
                "Parse ASN File Line output should match expected AsnInfo when line is correctly formatted");
    }

    /** A line without a country yields an empty country string. */
    @Test
    public void testParseAsnFileLine_NoCountry() {
        AsnInfo parsed = AsnTable.parseAsnFileLine("123456 Company");

        assertEquals(new AsnInfo(123456, "", "Company"), parsed,
                "Parse ASN File Line output should match expected AsnInfo when line lacks country");
    }
}

View File

@ -11,12 +11,14 @@ import nu.marginalia.converting.processor.logic.links.LinkGraph;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.*;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.geoip.sources.AsnTable;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.converting.processor.logic.links.TopKeywords;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import nu.marginalia.model.crawl.HtmlFeature;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -121,12 +123,7 @@ public class DomainProcessor {
List<String> terms = new ArrayList<>();
terms.add("ip:"+ip);
geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
terms.add("asn:"+asnInfo.asn());
terms.add("ip:"+asnInfo.country());
});
addIpInfo(terms, ip);
if (cookies) {
terms.add(HtmlFeature.COOKIES.getKeyword());
@ -156,6 +153,59 @@ public class DomainProcessor {
return ret;
}
/**
 * Appends search keywords derived from an IP address to {@code terms}.
 * <p>
 * The following terms are added:
 * <ul>
 *   <li>{@code ip:<address>} for the address itself</li>
 *   <li>{@code ip:<country>} (lower case) if the geo-IP dictionary reports a non-blank country</li>
 *   <li>{@code as:<number>} and one {@code as:<part>} (lower case) per dash-separated part of
 *       the organization name, if autonomous system information is available</li>
 *   <li>{@code special:cloud} if {@link #isCloudy} matches the autonomous system</li>
 * </ul>
 *
 * @param terms the list the keywords are appended to
 * @param ip    the IP address in textual form
 */
private void addIpInfo(List<String> terms, String ip) {
    terms.add("ip:"+ip);

    // Add IP location country as a term
    String country = geoIpDictionary.getCountry(ip);
    if (!country.isBlank()) { // use the ip:-prefix as there's no real confusion between e.g. ip:127.0.0.1 and ip:uk
        // Locale.ROOT keeps the keyword independent of the JVM's default locale
        // (a Turkish default locale would otherwise turn 'I' into a dotless 'ı')
        terms.add("ip:"+country.toLowerCase(java.util.Locale.ROOT));
    }

    // Add ASN as a term
    geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
        terms.add("as:"+asnInfo.asn());

        for (var orgPart : StringUtils.split(asnInfo.org(), '-')) {
            terms.add("as:"+orgPart.toLowerCase(java.util.Locale.ROOT));
        }

        if (isCloudy(asnInfo)) {
            terms.add("special:cloud");
        }
    });
}
/** Substrings of an autonomous system's organization name that mark it as a cloud provider. */
private static final List<String> CLOUD_ORG_MARKERS = List.of(
        "MICROSOFT-AZURE",
        "AMAZON",
        "CLOUDFLARE",
        "GOOGLE-CLOUD",
        "DIGITALOCEAN",
        "ALIBABA"
);

/**
 * Tells whether an autonomous system belongs to one of the known cloud providers.
 * <p>
 * The check is a case-sensitive substring match of the organization name against
 * {@link #CLOUD_ORG_MARKERS}.
 *
 * @param asnInfo the autonomous system information to inspect
 * @return true if the organization name contains any of the markers
 */
private boolean isCloudy(AsnTable.AsnInfo asnInfo) {
    String org = asnInfo.org();

    for (String marker : CLOUD_ORG_MARKERS) {
        if (org.contains(marker)) {
            return true;
        }
    }

    return false;
}
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
private boolean isAcademicDomain(EdgeDomain domain) {

View File

@ -108,6 +108,11 @@
<h1> Open Source </h1>
The search engine is open source with an AGPL license. The sources can be perused at
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
<h1>Data Sources</h1>
IP geolocation is sourced from the IP2Location LITE data available from
<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
under
<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA&nbsp;4.0</a>.
</section>
</footer>

View File

@ -6,7 +6,8 @@
Pages Known: {{pagesKnown}} <br/>
Pages Crawled: {{pagesFetched}} <br/>
Pages Indexed: {{pagesIndexed}} <br/>
<p></p>
IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/>
ASN: {{asn}} {{asnOrg}} <br/>
<span title="Autonomous System">AS</span>: {{#if asn}}AS{{asn}} {{asnOrg}} {{asnCountry}}{{/if}} <br/>
</fieldset>
<br/>

View File

@ -60,12 +60,12 @@ public class DomainInformationService {
String ip = rs.getString("IP");
builder.ip(ip);
var isnInfo = geoIpDictionary.getAsnInfo(ip);
if (isnInfo.isPresent()) {
builder.asn(isnInfo.get().asn());
builder.ipCountry(isnInfo.get().country());
builder.asnOrg(isnInfo.get().org());
}
geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
builder.asn(asnInfo.asn());
builder.asnOrg(asnInfo.org());
builder.asnCountry(asnInfo.country());
});
builder.ipCountry(geoIpDictionary.getCountry(ip));
builder.nodeAffinity(rs.getInt("NODE_AFFINITY"));
builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME")));

View File

@ -29,6 +29,9 @@ download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz
download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP
download_model data/asn-data-raw-table https://thyme.apnic.net/current/data-raw-table
download_model data/asn-used-autnums https://thyme.apnic.net/current/data-used-autnums