CatgirlIntelligenceAgency/code/index/test/nu/marginalia/ranking/domains/TestGraphSourceForInvertedLinkData.java
Viktor Lofgren 1d34224416 (refac) Remove src/main from all source code paths.
Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one.

While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules.  Which you'll do a lot, because it's *modul*ar.  The src/main/java convention makes a lot of sense for a non-modular project though.  This ain't that.
2024-02-23 16:13:40 +01:00

87 lines
3.1 KiB
Java

package nu.marginalia.ranking.domains;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.domains.data.GraphSource;
import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * {@link GraphSource} for tests that builds a domain link graph from export
 * files at fixed locations on the local file system.
 *
 * <p>The files are only present on machines where the export has been placed
 * at the hard-coded paths below; callers should consult {@link #isAvailable()}
 * before calling {@link #getGraph()}.
 */
public class TestGraphSourceForInvertedLinkData implements GraphSource {
    /** Tab-separated domain export; the first line is skipped when reading. */
    private static final Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");

    /** Binary link files, each read as a sequence of 64-bit (source, destination) id pairs. */
    private static final Path[] linksDataPaths = new Path[] {
            Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
    };

    /** Domain id to domain name, repopulated on every call to {@link #getGraph()}. */
    private Map<Integer, String> idToName = new HashMap<>();

    /**
     * Name-to-id lookup is not supported by this test source.
     *
     * @param domainNameList ignored
     * @return always an empty list
     */
    public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }

    /**
     * Reports whether the export files needed by {@link #getGraph()} exist.
     *
     * @return {@code true} only if the domain export and every link file exist;
     *         {@link #getGraph()} reads all of them, so a partial export is
     *         reported as unavailable rather than failing midway through loading
     */
    static boolean isAvailable() {
        if (!Files.exists(domainDataPath)) {
            return false;
        }
        for (Path linksPath : linksDataPaths) {
            if (!Files.exists(linksPath)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Looks up the name recorded for a domain id during the most recent
     * {@link #getGraph()} call.
     *
     * @param id domain id
     * @return the domain name, or {@code null} if the id was not loaded
     */
    public String getName(int id) {
        return idToName.get(id);
    }

    /**
     * Builds a directed graph whose vertices are the ids of domains with a
     * positive node affinity, and whose edges come from the link files.
     *
     * <p>Each stored (source, destination) pair is added as an edge from
     * destination to source, i.e. reversed relative to how it is stored.
     * Pairs where either end is not a vertex are dropped.
     *
     * @return the populated graph; a new instance on every call
     */
    @SneakyThrows
    @Override
    public Graph<Integer, ?> getGraph() {
        Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
        idToName = new HashMap<>();

        try (var lines = Files.lines(domainDataPath)) {
            // skip(1) drops the first line of the export
            lines.skip(1).forEach(line -> {
                // NOTE(review): StringUtils.split treats adjacent separators as one,
                // so an empty column would shift the indices below -- confirm the
                // export never contains empty fields.
                String[] parts = StringUtils.split(line, '\t');
                int id = Integer.parseInt(parts[0]);
                String name = parts[1];
                int nodeAffinity = Integer.parseInt(parts[3]);

                if (nodeAffinity > 0) {
                    graph.addVertex(id);
                    idToName.put(id, name);
                }
            });
        }

        for (var path : linksDataPaths) {
            try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
                data.forEach(0, data.size(), (pos, val) -> {
                    val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"

                    // high 32 bits hold the source id, low 32 bits the destination id
                    int src = (int) (val >>> 32);
                    int dest = (int) (val & 0xFFFF_FFFFL);

                    if (graph.containsVertex(src) && graph.containsVertex(dest)) {
                        // deliberately dest -> src: the graph is built with inverted links
                        graph.addEdge(dest, src);
                    }
                });
            }
        }

        return graph;
    }
}