(converter) Integrate atags with the topology field
This commit is contained in:
parent
e23976f6c4
commit
2b77184281
@ -79,7 +79,7 @@ public record DocumentMetadata(int avgSentLength,
|
||||
this(avgSentLength, 0, 0, 0, year, 0, quality, encodeFlags(flags));
|
||||
}
|
||||
|
||||
public DocumentMetadata withSize(int size, int topology) {
|
||||
public DocumentMetadata withSizeAndTopology(int size, int topology) {
|
||||
final int encSize = (int) Math.min(ENC_DOMAIN_SIZE_MASK, Math.max(1, size / ENC_DOMAIN_SIZE_MULTIPLIER));
|
||||
|
||||
return new DocumentMetadata(avgSentLength, rank, encSize, topology, year, sets, quality, flags);
|
||||
|
@ -75,7 +75,7 @@ class DocumentMetadataTest {
|
||||
@Test
|
||||
public void encRank() {
|
||||
var meta = new DocumentMetadata(0, 22, 8, EnumSet.noneOf(DocumentFlags.class))
|
||||
.withSize(0xffffffff, 5).encode();
|
||||
.withSizeAndTopology(0xffffffff, 5).encode();
|
||||
var enc2 = DocumentMetadata.encodeRank(meta, 83);
|
||||
|
||||
assertEquals(83, DocumentMetadata.decodeRank(enc2));
|
||||
@ -86,7 +86,7 @@ class DocumentMetadataTest {
|
||||
public void testYear() {
|
||||
for (int year = 1996; year < 2023; year++) {
|
||||
var meta = new DocumentMetadata(~0, new PubDate(null, year).yearByte(), ~0, EnumSet.allOf(DocumentFlags.class))
|
||||
.withSize(~0, ~0);
|
||||
.withSizeAndTopology(~0, ~0);
|
||||
|
||||
var encoded = DocumentMetadata.encodeRank(meta.encode(), 0);
|
||||
|
||||
@ -95,7 +95,7 @@ class DocumentMetadataTest {
|
||||
|
||||
for (int year = 1996; year < 2023; year++) {
|
||||
var meta = new DocumentMetadata(0, new PubDate(null, year).yearByte(), 0, EnumSet.noneOf(DocumentFlags.class))
|
||||
.withSize(0, 0);
|
||||
.withSizeAndTopology(0, 0);
|
||||
|
||||
var encoded = DocumentMetadata.encodeRank(meta.encode(), 0);
|
||||
|
||||
|
@ -3,7 +3,6 @@ package nu.marginalia.atags.model;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
@ -22,15 +21,29 @@ public class DomainLinks {
|
||||
Collectors.mapping(LinkWithText::toLink, Collectors.toList())));
|
||||
}
|
||||
|
||||
public List<String> getUrls() {
|
||||
return new ArrayList<>(links.keySet());
|
||||
/** Get all urls in this domain. */
|
||||
public List<EdgeUrl> getUrls(String schema) {
|
||||
List<EdgeUrl> ret = new ArrayList<>(links.size());
|
||||
|
||||
for (var link : links.keySet()) {
|
||||
EdgeUrl.parse(schema + "://" + link).ifPresent(ret::add);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Returns the links to the given url. */
|
||||
public List<Link> forUrl(EdgeUrl url) {
|
||||
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
|
||||
return links.getOrDefault(key, List.of());
|
||||
}
|
||||
|
||||
/** Returns the number of links to the given url. */
|
||||
public int countForUrl(EdgeUrl url) {
|
||||
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
|
||||
return links.getOrDefault(key, List.of()).size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DomainLinks{" +
|
||||
|
@ -23,7 +23,7 @@ class DomainAnchorTagsImplTest {
|
||||
var tags = domainAnchorTags.getAnchorTags(new EdgeDomain("www.chiark.greenend.org.uk"));
|
||||
|
||||
System.out.println(tags);
|
||||
System.out.println(tags.getUrls());
|
||||
System.out.println(tags.getUrls("http"));
|
||||
System.out.println(tags.forUrl(new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/putty/")));
|
||||
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/")));
|
||||
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt")));
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.converting.processor;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
@ -90,7 +91,7 @@ public class DomainProcessor {
|
||||
terms.add(HtmlFeature.COOKIES.getKeyword());
|
||||
}
|
||||
|
||||
var atags = anchorTagsSource.getAnchorTags(ret.domain);
|
||||
var externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
|
||||
|
||||
for (var document : ret.documents) {
|
||||
if (document.details == null)
|
||||
@ -103,11 +104,11 @@ public class DomainProcessor {
|
||||
document.words.addAllSyntheticTerms(terms);
|
||||
|
||||
document.words.addAnchorTerms(
|
||||
anchorTextKeywords.getAnchorTextKeywords(atags, document.url)
|
||||
anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, document.url)
|
||||
);
|
||||
}
|
||||
documentDeduplicator.deduplicate(ret.documents);
|
||||
calculateStatistics(ret);
|
||||
calculateStatistics(ret, externalDomainLinks);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -131,7 +132,7 @@ public class DomainProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
private void calculateStatistics(ProcessedDomain ret) {
|
||||
private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
|
||||
LinkGraph linkGraph = new LinkGraph();
|
||||
TopKeywords topKeywords = new TopKeywords();
|
||||
|
||||
@ -147,9 +148,10 @@ public class DomainProcessor {
|
||||
return;
|
||||
|
||||
int size = linkGraph.size();
|
||||
int topology = invertedLinkGraph.numLinks(doc.url);
|
||||
int topology = invertedLinkGraph.numLinks(doc.url)
|
||||
+ externalDomainLinks.countForUrl(doc.url);
|
||||
|
||||
doc.details.metadata = doc.details.metadata.withSize(size, topology);
|
||||
doc.details.metadata = doc.details.metadata.withSizeAndTopology(size, topology);
|
||||
});
|
||||
|
||||
siteWords.flagCommonSiteWords(ret);
|
||||
|
@ -53,7 +53,7 @@ public class SideloaderProcessing {
|
||||
|
||||
ret.details = details.details();
|
||||
ret.details.metadata = ret.details.metadata
|
||||
.withSize(size, Math.max(0, 32 - url.length()) / 4);
|
||||
.withSizeAndTopology(size, Math.max(0, 32 - url.length()) / 4);
|
||||
ret.url = new EdgeUrl(url);
|
||||
ret.state = UrlIndexingState.OK;
|
||||
ret.stateReason = "SIDELOAD";
|
||||
|
Loading…
Reference in New Issue
Block a user