From 2b77184281a477362348b6556c6b1f36c0af469f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 6 Nov 2023 13:46:44 +0100 Subject: [PATCH] (converter) Integrate atags with the topology field --- .../model/idx/DocumentMetadata.java | 2 +- .../model/DocumentMetadataTest.java | 6 +++--- .../marginalia/atags/model/DomainLinks.java | 19 ++++++++++++++++--- .../atags/DomainAnchorTagsImplTest.java | 2 +- .../converting/processor/DomainProcessor.java | 14 ++++++++------ .../sideload/SideloaderProcessing.java | 2 +- 6 files changed, 30 insertions(+), 15 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java index 26794c32..f16261ff 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java @@ -79,7 +79,7 @@ public record DocumentMetadata(int avgSentLength, this(avgSentLength, 0, 0, 0, year, 0, quality, encodeFlags(flags)); } - public DocumentMetadata withSize(int size, int topology) { + public DocumentMetadata withSizeAndTopology(int size, int topology) { final int encSize = (int) Math.min(ENC_DOMAIN_SIZE_MASK, Math.max(1, size / ENC_DOMAIN_SIZE_MULTIPLIER)); return new DocumentMetadata(avgSentLength, rank, encSize, topology, year, sets, quality, flags); diff --git a/code/common/model/src/test/java/nu/marginalia/model/DocumentMetadataTest.java b/code/common/model/src/test/java/nu/marginalia/model/DocumentMetadataTest.java index 6ba75d4f..bcbc7c81 100644 --- a/code/common/model/src/test/java/nu/marginalia/model/DocumentMetadataTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/DocumentMetadataTest.java @@ -75,7 +75,7 @@ class DocumentMetadataTest { @Test public void encRank() { var meta = new DocumentMetadata(0, 22, 8, EnumSet.noneOf(DocumentFlags.class)) - .withSize(0xffffffff, 5).encode(); + .withSizeAndTopology(0xffffffff, 5).encode(); var enc2 = DocumentMetadata.encodeRank(meta, 83); assertEquals(83, DocumentMetadata.decodeRank(enc2)); @@ -86,7 +86,7 @@ class DocumentMetadataTest { public void testYear() { for (int year = 1996; year < 2023; year++) { var meta = new DocumentMetadata(~0, new PubDate(null, year).yearByte(), ~0, EnumSet.allOf(DocumentFlags.class)) - .withSize(~0, ~0); + .withSizeAndTopology(~0, ~0); var encoded = DocumentMetadata.encodeRank(meta.encode(), 0); @@ -95,7 +95,7 @@ class DocumentMetadataTest { for (int year = 1996; year < 2023; year++) { var meta = new DocumentMetadata(0, new PubDate(null, year).yearByte(), 0, EnumSet.noneOf(DocumentFlags.class)) - .withSize(0, 0); + .withSizeAndTopology(0, 0); var encoded = DocumentMetadata.encodeRank(meta.encode(), 0); diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/DomainLinks.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/DomainLinks.java index bee75337..14e6ad99 100644 --- a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/DomainLinks.java +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/DomainLinks.java @@ -3,7 +3,6 @@ package nu.marginalia.atags.model; import nu.marginalia.model.EdgeUrl; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -22,15 +21,29 @@ public class DomainLinks { Collectors.mapping(LinkWithText::toLink, Collectors.toList()))); } - public List getUrls() { - return new ArrayList<>(links.keySet()); + /** Get all urls in this domain. */ + public List getUrls(String schema) { + List ret = new ArrayList<>(links.size()); + + for (var link : links.keySet()) { + EdgeUrl.parse(schema + "://" + link).ifPresent(ret::add); + } + + return ret; } + /** Returns the links to the given url. */ public List forUrl(EdgeUrl url) { String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param); return links.getOrDefault(key, List.of()); } + /** Returns the number of links to the given url. */ + public int countForUrl(EdgeUrl url) { + String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param); + return links.getOrDefault(key, List.of()).size(); + } + @Override public String toString() { return "DomainLinks{" + diff --git a/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/atags/DomainAnchorTagsImplTest.java b/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/atags/DomainAnchorTagsImplTest.java index d585ff7c..b18f6e4b 100644 --- a/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/atags/DomainAnchorTagsImplTest.java +++ b/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/atags/DomainAnchorTagsImplTest.java @@ -23,7 +23,7 @@ class DomainAnchorTagsImplTest { var tags = domainAnchorTags.getAnchorTags(new EdgeDomain("www.chiark.greenend.org.uk")); System.out.println(tags); - System.out.println(tags.getUrls()); + System.out.println(tags.getUrls("http")); System.out.println(tags.forUrl(new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/putty/"))); System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/"))); System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt"))); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index e16d1afe..476dfc16 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.atags.AnchorTextKeywords; +import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.ProcessedDocument; @@ -90,7 +91,7 @@ public class DomainProcessor { terms.add(HtmlFeature.COOKIES.getKeyword()); } - var atags = anchorTagsSource.getAnchorTags(ret.domain); + var externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain); for (var document : ret.documents) { if (document.details == null) @@ -103,11 +104,11 @@ public class DomainProcessor { document.words.addAllSyntheticTerms(terms); document.words.addAnchorTerms( - anchorTextKeywords.getAnchorTextKeywords(atags, document.url) + anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, document.url) ); } documentDeduplicator.deduplicate(ret.documents); - calculateStatistics(ret); + calculateStatistics(ret, externalDomainLinks); return ret; } @@ -131,7 +132,7 @@ public class DomainProcessor { } } - private void calculateStatistics(ProcessedDomain ret) { + private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) { LinkGraph linkGraph = new LinkGraph(); TopKeywords topKeywords = new TopKeywords(); @@ -147,9 +148,10 @@ public class DomainProcessor { return; int size = linkGraph.size(); - int topology = invertedLinkGraph.numLinks(doc.url); + int topology = invertedLinkGraph.numLinks(doc.url) + + externalDomainLinks.countForUrl(doc.url); - doc.details.metadata = doc.details.metadata.withSize(size, topology); + doc.details.metadata = doc.details.metadata.withSizeAndTopology(size, topology); }); siteWords.flagCommonSiteWords(ret); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index e0691471..60bc2f1a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -53,7 +53,7 @@ public class SideloaderProcessing { ret.details = details.details(); ret.details.metadata = ret.details.metadata - .withSize(size, Math.max(0, 32 - url.length()) / 4); + .withSizeAndTopology(size, Math.max(0, 32 - url.length()) / 4); ret.url = new EdgeUrl(url); ret.state = UrlIndexingState.OK; ret.stateReason = "SIDELOAD";