(converter) Integrate atags with the topology field

This commit is contained in:
Viktor Lofgren 2023-11-06 13:46:44 +01:00
parent e23976f6c4
commit 2b77184281
6 changed files with 30 additions and 15 deletions

View File

@ -79,7 +79,7 @@ public record DocumentMetadata(int avgSentLength,
this(avgSentLength, 0, 0, 0, year, 0, quality, encodeFlags(flags));
}
public DocumentMetadata withSize(int size, int topology) {
public DocumentMetadata withSizeAndTopology(int size, int topology) {
final int encSize = (int) Math.min(ENC_DOMAIN_SIZE_MASK, Math.max(1, size / ENC_DOMAIN_SIZE_MULTIPLIER));
return new DocumentMetadata(avgSentLength, rank, encSize, topology, year, sets, quality, flags);

View File

@ -75,7 +75,7 @@ class DocumentMetadataTest {
@Test
public void encRank() {
var meta = new DocumentMetadata(0, 22, 8, EnumSet.noneOf(DocumentFlags.class))
.withSize(0xffffffff, 5).encode();
.withSizeAndTopology(0xffffffff, 5).encode();
var enc2 = DocumentMetadata.encodeRank(meta, 83);
assertEquals(83, DocumentMetadata.decodeRank(enc2));
@ -86,7 +86,7 @@ class DocumentMetadataTest {
public void testYear() {
for (int year = 1996; year < 2023; year++) {
var meta = new DocumentMetadata(~0, new PubDate(null, year).yearByte(), ~0, EnumSet.allOf(DocumentFlags.class))
.withSize(~0, ~0);
.withSizeAndTopology(~0, ~0);
var encoded = DocumentMetadata.encodeRank(meta.encode(), 0);
@ -95,7 +95,7 @@ class DocumentMetadataTest {
for (int year = 1996; year < 2023; year++) {
var meta = new DocumentMetadata(0, new PubDate(null, year).yearByte(), 0, EnumSet.noneOf(DocumentFlags.class))
.withSize(0, 0);
.withSizeAndTopology(0, 0);
var encoded = DocumentMetadata.encodeRank(meta.encode(), 0);

View File

@ -3,7 +3,6 @@ package nu.marginalia.atags.model;
import nu.marginalia.model.EdgeUrl;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ -22,15 +21,29 @@ public class DomainLinks {
Collectors.mapping(LinkWithText::toLink, Collectors.toList())));
}
public List<String> getUrls() {
return new ArrayList<>(links.keySet());
/** Get all urls in this domain. */
public List<EdgeUrl> getUrls(String schema) {
List<EdgeUrl> ret = new ArrayList<>(links.size());
for (var link : links.keySet()) {
EdgeUrl.parse(schema + "://" + link).ifPresent(ret::add);
}
return ret;
}
/**
 * Looks up the links recorded for the given url.
 *
 * <p>The lookup key is assembled from the url's domain, its path and,
 * when a query parameter is present, {@code "?"} followed by that parameter.
 *
 * @param url the url to look up
 * @return the links stored under the url's key, or an empty list when
 *         no entry exists for it
 */
public List<Link> forUrl(EdgeUrl url) {
    String querySuffix = "";
    if (url.param != null) {
        querySuffix = "?" + url.param;
    }

    String lookupKey = url.domain.toString() + url.path + querySuffix;

    return links.getOrDefault(lookupKey, List.of());
}
/**
 * Counts the links recorded for the given url.
 *
 * <p>The lookup key is the url's domain followed by its path and, when a
 * query parameter is present, {@code "?"} plus that parameter.
 *
 * @param url the url to look up
 * @return the size of the link list stored under the url's key, or 0 when
 *         no entry exists for it
 */
public int countForUrl(EdgeUrl url) {
    final String querySuffix = (url.param != null) ? "?" + url.param : "";
    final String lookupKey = url.domain.toString() + url.path + querySuffix;
    final List<Link> matches = links.getOrDefault(lookupKey, List.of());

    return matches.size();
}
@Override
public String toString() {
return "DomainLinks{" +

View File

@ -23,7 +23,7 @@ class DomainAnchorTagsImplTest {
var tags = domainAnchorTags.getAnchorTags(new EdgeDomain("www.chiark.greenend.org.uk"));
System.out.println(tags);
System.out.println(tags.getUrls());
System.out.println(tags.getUrls("http"));
System.out.println(tags.forUrl(new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/putty/")));
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/")));
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt")));

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.processor;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.ProcessedDocument;
@ -90,7 +91,7 @@ public class DomainProcessor {
terms.add(HtmlFeature.COOKIES.getKeyword());
}
var atags = anchorTagsSource.getAnchorTags(ret.domain);
var externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
for (var document : ret.documents) {
if (document.details == null)
@ -103,11 +104,11 @@ public class DomainProcessor {
document.words.addAllSyntheticTerms(terms);
document.words.addAnchorTerms(
anchorTextKeywords.getAnchorTextKeywords(atags, document.url)
anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, document.url)
);
}
documentDeduplicator.deduplicate(ret.documents);
calculateStatistics(ret);
calculateStatistics(ret, externalDomainLinks);
return ret;
}
@ -131,7 +132,7 @@ public class DomainProcessor {
}
}
private void calculateStatistics(ProcessedDomain ret) {
private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
LinkGraph linkGraph = new LinkGraph();
TopKeywords topKeywords = new TopKeywords();
@ -147,9 +148,10 @@ public class DomainProcessor {
return;
int size = linkGraph.size();
int topology = invertedLinkGraph.numLinks(doc.url);
int topology = invertedLinkGraph.numLinks(doc.url)
+ externalDomainLinks.countForUrl(doc.url);
doc.details.metadata = doc.details.metadata.withSize(size, topology);
doc.details.metadata = doc.details.metadata.withSizeAndTopology(size, topology);
});
siteWords.flagCommonSiteWords(ret);

View File

@ -53,7 +53,7 @@ public class SideloaderProcessing {
ret.details = details.details();
ret.details.metadata = ret.details.metadata
.withSize(size, Math.max(0, 32 - url.length()) / 4);
.withSizeAndTopology(size, Math.max(0, 32 - url.length()) / 4);
ret.url = new EdgeUrl(url);
ret.state = UrlIndexingState.OK;
ret.stateReason = "SIDELOAD";