(geo-ip) Revert removal of ip2location logic

We now use both IP2Location and ASN data.

The change also adds some keywords based on autonomous system information, on a somewhat experimental basis. It would be neat to be able to exclude, for example, cloud services in general, or just Cloudflare, from the search results.
This commit is contained in:
Viktor Lofgren 2023-12-17 15:03:00 +01:00
parent bde68ba48b
commit c92f1b8df8
14 changed files with 196 additions and 159 deletions

View File

@ -24,6 +24,8 @@ public class DomainInformation {
String ip; String ip;
Integer asn; Integer asn;
String asnOrg; String asnOrg;
String asnCountry;
String ipCountry; String ipCountry;
String state; String state;

View File

@ -2,7 +2,6 @@ package nu.marginalia.ip_blocklist;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.geoip.AsnTable;
import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -44,9 +43,7 @@ public class GeoIpBlocklist {
public String getCountry(EdgeDomain domain) { public String getCountry(EdgeDomain domain) {
try { try {
return ipDictionary.getAsnInfo(InetAddressCache.getAddress(domain)) return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
.map(AsnTable.AsnInfo::country)
.orElse("-");
} }
catch (Throwable ex) { catch (Throwable ex) {
logger.debug("Failed to resolve {}", domain); logger.debug("Failed to resolve {}", domain);

View File

@ -1,6 +1,9 @@
package nu.marginalia.geoip; package nu.marginalia.geoip;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.geoip.sources.AsnMapping;
import nu.marginalia.geoip.sources.AsnTable;
import nu.marginalia.geoip.sources.IP2LocationMapping;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -8,10 +11,12 @@ import java.net.InetAddress;
import java.util.Optional; import java.util.Optional;
public class GeoIpDictionary { public class GeoIpDictionary {
private volatile IP2LocationMapping ip2locMapping = null;
private volatile AsnTable asnTable = null; private volatile AsnTable asnTable = null;
private volatile AsnMapping asnMapping = null; private volatile AsnMapping asnMapping = null;
private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class); private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
volatile boolean ready = false;
public GeoIpDictionary() { public GeoIpDictionary() {
Thread.ofPlatform().start(() -> { Thread.ofPlatform().start(() -> {
@ -19,15 +24,22 @@ public class GeoIpDictionary {
logger.info("Loaded ASN table"); logger.info("Loaded ASN table");
this.asnMapping = new AsnMapping(WmsaHome.getAsnMappingDatabase()); this.asnMapping = new AsnMapping(WmsaHome.getAsnMappingDatabase());
logger.info("Loaded ASN mapping"); logger.info("Loaded ASN mapping");
this.ip2locMapping = new IP2LocationMapping(WmsaHome.getIPLocationDatabse());
ready = true;
synchronized (this) {
this.notifyAll();
}
}); });
} }
public boolean isReady() { public boolean isReady() {
return null != asnMapping; return ready;
} }
public boolean waitReady() { public boolean waitReady() {
while (null == asnMapping) { while (!ready) {
try { try {
synchronized (this) { synchronized (this) {
this.wait(1000); this.wait(1000);
@ -39,6 +51,22 @@ public class GeoIpDictionary {
return true; return true;
} }
public String getCountry(String ip) {
if (null == ip2locMapping) {
return "";
}
return ip2locMapping.getCountry(ip);
}
public String getCountry(InetAddress address) {
if (null == ip2locMapping) {
return "";
}
return ip2locMapping.getCountry(address);
}
public Optional<AsnTable.AsnInfo> getAsnInfo(String ip) { public Optional<AsnTable.AsnInfo> getAsnInfo(String ip) {
try { try {
return getAsnInfo(InetAddress.getByName(ip)); return getAsnInfo(InetAddress.getByName(ip));

View File

@ -1,4 +1,4 @@
package nu.marginalia.geoip; package nu.marginalia.geoip.sources;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -6,55 +6,35 @@ import org.slf4j.LoggerFactory;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.TreeMap;
public class AsnMapping { public class AsnMapping {
private static final Logger logger = LoggerFactory.getLogger(AsnMapping.class); private static final Logger logger = LoggerFactory.getLogger(AsnMapping.class);
private final TreeMap<Integer, AsnMappingRecord> asns = new TreeMap<>(Integer::compareUnsigned); private final IpRangeMapping<Integer> ranges = new IpRangeMapping<>();
public record AsnMappingRecord(int ipStart, int ipEnd, int asn) {
public boolean contains(int ip) {
return Integer.compareUnsigned(ipStart, ip) <= 0
&& Integer.compareUnsigned(ip, ipEnd) < 0;
}
}
public AsnMapping(Path databaseFile) { public AsnMapping(Path databaseFile) {
try (var reader = Files.lines(databaseFile)) { try (var reader = Files.lines(databaseFile)) {
reader.map(AsnMapping::parseAsnMappingFileLine).filter(Objects::nonNull).forEach(asn -> asns.put(asn.ipStart(), asn)); reader.forEach(this::parseAsnMappingFileLine);
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed to load ASN mapping" + databaseFile, e); logger.error("Failed to load ASN mapping" + databaseFile, e);
} }
} }
public Optional<Integer> getAsnNumber(int ip) { public Optional<Integer> getAsnNumber(int ip) {
var entry = asns.floorEntry(ip); return ranges.get(ip);
if (null == entry) {
return Optional.empty();
}
var asn = entry.getValue();
if (asn.contains(ip)) {
return Optional.of(asn.asn());
}
return Optional.empty();
} }
public static AsnMappingRecord parseAsnMappingFileLine(String s) { private void parseAsnMappingFileLine(String s) {
try { try {
String[] parts = StringUtils.split(s, '\t'); String[] parts = StringUtils.split(s, '\t');
if (parts.length != 2) { if (parts.length != 2) {
return null; return;
} }
// Parse CIDR notation, e.g. 127.0.0.1/24 -> ["127.0.0.1", "24"] // Parse CIDR notation, e.g. 127.0.0.1/24 -> ["127.0.0.1", "24"]
String[] cidrParts = StringUtils.split(parts[0], '/'); String[] cidrParts = StringUtils.split(parts[0], '/');
if (cidrParts.length != 2) { if (cidrParts.length != 2) {
return null; return;
} }
// Parse IP address and subnet mask // Parse IP address and subnet mask
@ -72,12 +52,12 @@ public class AsnMapping {
ipStart &= 0xFFFFFFFF << (32 - ipMask); ipStart &= 0xFFFFFFFF << (32 - ipMask);
ipEnd |= 0xFFFFFFFF >>> ipMask; ipEnd |= 0xFFFFFFFF >>> ipMask;
return new AsnMappingRecord(ipStart, ipEnd, Integer.parseInt(parts[1]));
ranges.add(ipStart, ipEnd, Integer.parseInt(parts[1]));
} }
catch (Exception ex) { catch (Exception ex) {
logger.warn("Failed to parse ASN mapping line: {}", s); logger.warn("Failed to parse ASN mapping line: {}", s);
return null;
} }
} }

View File

@ -1,4 +1,4 @@
package nu.marginalia.geoip; package nu.marginalia.geoip.sources;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import org.slf4j.Logger; import org.slf4j.Logger;

View File

@ -0,0 +1,40 @@
package nu.marginalia.geoip.sources;
import com.opencsv.CSVReader;
import java.net.InetAddress;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Country lookup backed by an IP2LOCATION LITE database file.
 *
 * <p>The file is read as CSV when the object is constructed. For each row, the first two
 * columns are parsed as unsigned 32-bit integers giving the start and end of an address
 * range, and the third column is stored as the value returned for addresses in that range.
 */
public class IP2LocationMapping {
    private final IpRangeMapping<String> ranges = new IpRangeMapping<>();

    /**
     * Reads every row of the given CSV file into the range mapping.
     *
     * @param filename path of the database file to load
     * @throws RuntimeException wrapping any failure to read or parse the file
     */
    public IP2LocationMapping(Path filename) {
        try (var csv = new CSVReader(Files.newBufferedReader(filename))) {
            String[] row;
            while ((row = csv.readNext()) != null) {
                int rangeStart = Integer.parseUnsignedInt(row[0]);
                int rangeEnd = Integer.parseUnsignedInt(row[1]);
                ranges.add(rangeStart, rangeEnd, row[2]);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Looks up the country for an address given in textual form.
     *
     * @param ip the address as text, resolved through {@link InetAddress#getByName(String)}
     * @return the value stored for the matching range, or an empty string when the address
     *         cannot be resolved, the lookup fails, or no range matches
     */
    public String getCountry(String ip) {
        try {
            InetAddress resolved = InetAddress.getByName(ip);
            return getCountry(resolved);
        } catch (Exception ignored) {
            // Lookups are best-effort; any failure is reported as "unknown"
            return "";
        }
    }

    /**
     * Looks up the country for an address.
     *
     * @param address the address to look up
     * @return the value stored for the matching range, or an empty string when none matches
     */
    public String getCountry(InetAddress address) {
        var match = ranges.get(address);
        return match.orElse("");
    }
}

View File

@ -0,0 +1,41 @@
package nu.marginalia.geoip.sources;
import java.net.InetAddress;
import java.util.Optional;
import java.util.TreeMap;
/**
 * Maps ranges of IPv4 addresses to values of type {@code T}.
 *
 * <p>Addresses are handled as 32-bit integers compared as unsigned values. Ranges are
 * stored in a sorted map keyed by their start address; a lookup finds the range with the
 * greatest start address not above the queried address and then verifies that the address
 * actually falls inside that range.
 *
 * @param <T> the type of value associated with each range
 */
public class IpRangeMapping<T> {
    private final TreeMap<Integer, IpRangeWithCountry<T>> ranges = new TreeMap<>(Integer::compareUnsigned);

    /**
     * A single address range and its associated value.
     *
     * @param ipStart first address of the range (inclusive), as an unsigned 32-bit value
     * @param ipEnd   last address of the range (inclusive), as an unsigned 32-bit value
     * @param value   the value associated with the range
     */
    public record IpRangeWithCountry<T>(int ipStart, int ipEnd, T value) {
        /**
         * Tests whether the address lies within this range.
         *
         * <p>Both bounds are inclusive: ipEnd is the last address that belongs to the
         * range (for instance a CIDR-derived end with all host bits set), so it must match.
         *
         * @param ip the address as an unsigned 32-bit value
         * @return true if {@code ipStart <= ip <= ipEnd} under unsigned comparison
         */
        public boolean contains(int ip) {
            return Integer.compareUnsigned(ipStart, ip) <= 0
                    && Integer.compareUnsigned(ip, ipEnd) <= 0;
        }
    }

    /**
     * Registers a range. A previously added range with the same start address is replaced.
     *
     * @param ipStart first address of the range (inclusive)
     * @param ipEnd   last address of the range (inclusive)
     * @param value   the value to return for addresses in the range
     */
    public void add(int ipStart, int ipEnd, T value) {
        ranges.put(ipStart, new IpRangeWithCountry<>(ipStart, ipEnd, value));
    }

    /**
     * Looks up the value for an address.
     *
     * @param address the address to look up
     * @return the value of the range containing the address, or empty if there is none
     *         or if the address is not a 4-byte (IPv4) address
     */
    public Optional<T> get(InetAddress address) {
        byte[] bytes = address.getAddress();

        // Only IPv4 fits the 32-bit key space; truncating an IPv6 address to its
        // first four bytes would produce a lookup for an unrelated IPv4 address.
        if (bytes.length != 4) {
            return Optional.empty();
        }

        int ival = (bytes[0] & 0xFF) << 24
                | (bytes[1] & 0xFF) << 16
                | (bytes[2] & 0xFF) << 8
                | (bytes[3] & 0xFF);

        return get(ival);
    }

    /**
     * Looks up the value for an address given as an unsigned 32-bit integer.
     *
     * @param ipUnsignedInt the address, with the bits interpreted as unsigned
     * @return the value of the range containing the address, or empty if there is none
     */
    public Optional<T> get(int ipUnsignedInt) {
        // The candidate is the range with the greatest start address <= the queried address
        var candidate = ranges.floorEntry(ipUnsignedInt);
        if (null == candidate) {
            return Optional.empty();
        }

        var range = candidate.getValue();
        if (range.contains(ipUnsignedInt)) {
            return Optional.of(range.value());
        }

        return Optional.empty();
    }
}

View File

@ -1,55 +0,0 @@
package nu.marginalia.geoip;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.net.InetAddress;
import java.net.UnknownHostException;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertNotNull;
public class AsnMappingTest {

    /**
     * A well-formed line parses to a record carrying the AS number, while free text,
     * null, the empty string, and a line with only the CIDR part all parse to null.
     */
    @Test
    public void testParseAsnMappingFileLine() throws UnknownHostException {
        // Well-formed line: CIDR block, tab, AS number
        AsnMapping.AsnMappingRecord parsed = AsnMapping.parseAsnMappingFileLine("192.0.2.0/24\t65536");
        assertNotNull(parsed, "The result should not be null for valid data");
        assertEquals(65536, parsed.asn(), "The asn is not as expected");

        // Free text that does not follow the expected format
        assertNull(
                AsnMapping.parseAsnMappingFileLine("nah I am just a string, not an ASN Mapping Line..."),
                "The result should be null for invalid data");

        // Null input
        assertNull(
                AsnMapping.parseAsnMappingFileLine(null),
                "The result should be null for null input");

        // Empty input
        assertNull(
                AsnMapping.parseAsnMappingFileLine(""),
                "The result should be null for empty string");

        // CIDR block without the AS number column
        assertNull(
                AsnMapping.parseAsnMappingFileLine("192.0.2.0/24"),
                "The result should be null for a string with only one part");
    }

    /** The first address of a parsed CIDR block is contained in the resulting record. */
    @Test
    public void testIpBounds() throws UnknownHostException {
        AsnMapping.AsnMappingRecord record = AsnMapping.parseAsnMappingFileLine("193.183.0.0/24\t207825");
        assertNotNull(record, "The result should not be null for valid data");

        // Pack the four address octets into a big-endian int
        byte[] octets = InetAddress.getByName("193.183.0.0").getAddress();
        int packed = (octets[0] & 0xFF) << 24
                | (octets[1] & 0xFF) << 16
                | (octets[2] & 0xFF) << 8
                | (octets[3] & 0xFF);

        assertTrue(record.contains(packed));
    }
}

View File

@ -1,55 +0,0 @@
package nu.marginalia.geoip;
import nu.marginalia.geoip.AsnTable;
import nu.marginalia.geoip.AsnTable.AsnInfo;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
/**
 * Tests for the static method {@code AsnTable.parseAsnFileLine}, which turns one line of
 * an ASN table file into an {@code AsnInfo} holding the AS number, country and
 * organization string.
 */
public class AsnTableTest {

    /** Unallocated AS numbers produce no record. */
    @Test
    public void testParseAsnFileLine_ShouldReturnNullWhenUnallocated() {
        AsnInfo parsed = AsnTable.parseAsnFileLine(" 1 UNALLOCATED");

        assertNull(parsed, "Parse ASN File Line output should be null for unallocated ASN");
    }

    /** Lines that cannot be parsed produce no record. */
    @Test
    public void testParseAsnFileLine_ShouldReturnNullWhenInputIsNotParsable() {
        AsnInfo parsed = AsnTable.parseAsnFileLine(" NotParsable Line");

        assertNull(parsed, "Parse ASN File Line output should be null for unparsable lines");
    }

    /** A line with AS number, organization and country yields all three fields. */
    @Test
    public void testParseAsnFileLine_AllFieldsParsedCorrectly() {
        AsnInfo parsed = AsnTable.parseAsnFileLine("123456 Company,US ");

        assertEquals(new AsnInfo(123456, "US", "Company"), parsed,
                "Parse ASN File Line output should match expected AsnInfo when line is correctly formatted");
    }

    /** Only the last comma separates the country; earlier commas stay in the organization. */
    @Test
    public void testParseAsnFileLine_MultipleCommasInOrg() {
        AsnInfo parsed = AsnTable.parseAsnFileLine("123456 Company, Inc., US ");

        assertEquals(new AsnInfo(123456, "US", "Company, Inc."), parsed,
                "Parse ASN File Line output should match expected AsnInfo when line is correctly formatted");
    }

    /** A line without a country yields an empty country string. */
    @Test
    public void testParseAsnFileLine_NoCountry() {
        AsnInfo parsed = AsnTable.parseAsnFileLine("123456 Company");

        assertEquals(new AsnInfo(123456, "", "Company"), parsed,
                "Parse ASN File Line output should match expected AsnInfo when line lacks country");
    }
}

View File

@ -11,12 +11,14 @@ import nu.marginalia.converting.processor.logic.links.LinkGraph;
import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.*; import nu.marginalia.crawling.model.*;
import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.geoip.sources.AsnTable;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.links.TopKeywords;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -121,12 +123,7 @@ public class DomainProcessor {
List<String> terms = new ArrayList<>(); List<String> terms = new ArrayList<>();
terms.add("ip:"+ip); addIpInfo(terms, ip);
geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
terms.add("asn:"+asnInfo.asn());
terms.add("ip:"+asnInfo.country());
});
if (cookies) { if (cookies) {
terms.add(HtmlFeature.COOKIES.getKeyword()); terms.add(HtmlFeature.COOKIES.getKeyword());
@ -156,6 +153,59 @@ public class DomainProcessor {
return ret; return ret;
} }
/**
 * Adds search keywords derived from the IP address to the given term list.
 *
 * <p>The terms added are: the IP itself ({@code ip:<address>}), the lower-cased
 * geolocation country if one is known ({@code ip:<country>}), and, when autonomous
 * system information is available, the AS number and each dash-separated part of the
 * organization name ({@code as:<...>}), plus {@code special:cloud} when the organization
 * matches one of the known cloud providers.
 *
 * @param terms the list the keywords are appended to
 * @param ip    the IP address in textual form
 */
private void addIpInfo(List<String> terms, String ip) {
    terms.add("ip:"+ip);

    // Add IP location country as a term
    String country = geoIpDictionary.getCountry(ip);
    if (!country.isBlank()) { // use the ip:-prefix as there's no real confusion between e.g. ip:127.0.0.1 and ip:uk
        // Locale.ROOT keeps the keyword stable regardless of the JVM's default locale
        // (e.g. a Turkish locale would otherwise lower-case "IT" to "ıt")
        terms.add("ip:"+country.toLowerCase(java.util.Locale.ROOT));
    }

    // Add ASN as a term
    geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
        terms.add("as:"+asnInfo.asn());

        for (var orgPart : StringUtils.split(asnInfo.org(), '-')) {
            terms.add("as:"+orgPart.toLowerCase(java.util.Locale.ROOT));
        }

        if (isCloudy(asnInfo)) {
            terms.add("special:cloud");
        }
    });
}
/**
 * Tells whether the autonomous system appears to belong to a cloud provider.
 *
 * <p>The check is a case-sensitive substring match of the organization string against a
 * fixed list of provider markers.
 *
 * @param asnInfo the autonomous system information to inspect
 * @return true if the organization string contains any of the known markers
 */
private boolean isCloudy(AsnTable.AsnInfo asnInfo) {
    // Each marker is listed once; the previous if-chain tested CLOUDFLARE twice
    final String[] cloudOrgMarkers = {
            "MICROSOFT-AZURE",
            "AMAZON",
            "CLOUDFLARE",
            "GOOGLE-CLOUD",
            "DIGITALOCEAN",
            "ALIBABA"
    };

    String org = asnInfo.org();

    for (String marker : cloudOrgMarkers) {
        if (org.contains(marker)) {
            return true;
        }
    }

    return false;
}
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$"); private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
private boolean isAcademicDomain(EdgeDomain domain) { private boolean isAcademicDomain(EdgeDomain domain) {

View File

@ -108,6 +108,11 @@
<h1> Open Source </h1> <h1> Open Source </h1>
The search engine is open source with an AGPL license. The sources can be perused at The search engine is open source with an AGPL license. The sources can be perused at
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>. <tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
<h1>Data Sources</h1>
IP geolocation is sourced from the IP2Location LITE data available from
<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
under
<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA&nbsp;4.0</a>.
</section> </section>
</footer> </footer>

View File

@ -6,7 +6,8 @@
Pages Known: {{pagesKnown}} <br/> Pages Known: {{pagesKnown}} <br/>
Pages Crawled: {{pagesFetched}} <br/> Pages Crawled: {{pagesFetched}} <br/>
Pages Indexed: {{pagesIndexed}} <br/> Pages Indexed: {{pagesIndexed}} <br/>
<p></p>
IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/> IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/>
ASN: {{asn}} {{asnOrg}} <br/> <span title="Autonomous System">AS</span>: {{#if asn}}AS{{asn}} {{asnOrg}} {{asnCountry}}{{/if}} <br/>
</fieldset> </fieldset>
<br/> <br/>

View File

@ -60,12 +60,12 @@ public class DomainInformationService {
String ip = rs.getString("IP"); String ip = rs.getString("IP");
builder.ip(ip); builder.ip(ip);
var isnInfo = geoIpDictionary.getAsnInfo(ip); geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
if (isnInfo.isPresent()) { builder.asn(asnInfo.asn());
builder.asn(isnInfo.get().asn()); builder.asnOrg(asnInfo.org());
builder.ipCountry(isnInfo.get().country()); builder.asnCountry(asnInfo.country());
builder.asnOrg(isnInfo.get().org()); });
} builder.ipCountry(geoIpDictionary.getCountry(ip));
builder.nodeAffinity(rs.getInt("NODE_AFFINITY")); builder.nodeAffinity(rs.getInt("NODE_AFFINITY"));
builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME"))); builder.domain(new EdgeDomain(rs.getString("DOMAIN_NAME")));

View File

@ -29,6 +29,9 @@ download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz
download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP
download_model data/asn-data-raw-table https://thyme.apnic.net/current/data-raw-table download_model data/asn-data-raw-table https://thyme.apnic.net/current/data-raw-table
download_model data/asn-used-autnums https://thyme.apnic.net/current/data-used-autnums download_model data/asn-used-autnums https://thyme.apnic.net/current/data-used-autnums