(search) Merge similar sites results with the info view.

WIP: This commit needs to be cleaned up.
This commit is contained in:
Viktor Lofgren 2023-12-04 22:10:24 +01:00
parent b41bb9cfcf
commit 8a1934008c
13 changed files with 508 additions and 161 deletions

View File

@ -26,7 +26,7 @@ public class DbBrowseDomainsRandom {
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
final String q = """
SELECT DOMAIN_ID, DOMAIN_NAME
SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
FROM EC_RANDOM_DOMAINS
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
WHERE STATE<2
@ -44,9 +44,10 @@ public class DbBrowseDomainsRandom {
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
boolean indexed = rsp.getBoolean("INDEXED");
if (!blacklist.isBlacklisted(id)) {
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, indexed));
}
}
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.browse;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklist;
@ -23,14 +24,15 @@ public class DbBrowseDomainsSimilarCosine {
this.dataSource = dataSource;
}
public List<BrowseResult> getDomainNeighborsAdjacentCosine(int domainId, DomainBlacklist blacklist, int count) {
public List<BrowseResult> getDomainNeighborsAdjacentCosineRequireScreenshot(int domainId, DomainBlacklist blacklist, int count) {
List<BrowseResult> domains = new ArrayList<>(count);
String q = """
SELECT
EC_DOMAIN.ID,
NV.NEIGHBOR_NAME,
NV.RELATEDNESS
NV.RELATEDNESS,
EC_DOMAIN.INDEXED
FROM EC_NEIGHBORS_VIEW NV
INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME=NV.NEIGHBOR_NAME
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
@ -49,9 +51,10 @@ public class DbBrowseDomainsSimilarCosine {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
double relatedness = rsp.getDouble(3);
boolean indexed = rsp.getBoolean("INDEXED");
if (!blacklist.isBlacklisted(id)) {
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness, indexed));
}
}
}

View File

@ -27,7 +27,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
final Set<BrowseResult> domains = new HashSet<>(count*3);
final String q = """
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT, INDEXED
FROM EC_DOMAIN_NEIGHBORS
INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
@ -54,14 +54,14 @@ public class DbBrowseDomainsSimilarOldAlgo {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
}
}
}
if (domains.size() < count/2) {
final String q2 = """
SELECT EC_DOMAIN.ID, DOMAIN_NAME
SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
@ -83,7 +83,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
}
}
}
@ -91,7 +91,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
if (domains.size() < count/2) {
final String q3 = """
SELECT EC_DOMAIN.ID, DOMAIN_NAME
SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
@ -115,7 +115,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
}
}
}
@ -128,38 +128,5 @@ public class DbBrowseDomainsSimilarOldAlgo {
return new ArrayList<>(domains);
}
/**
 * Fetches randomly ordered domains from one of the random-domain sets.
 *
 * <p>The SQL {@code LIMIT} is applied before the blacklist filter, so the
 * returned list may hold fewer than {@code count} entries.
 *
 * @param count     maximum number of rows requested from the database
 * @param blacklist domains whose id is blacklisted are left out of the result
 * @param set       value matched against {@code DOMAIN_SET}
 * @return the surviving domains with relatedness 0; whatever was collected
 *         before a SQL error (possibly nothing) if the query fails
 */
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
    final String randomDomainsQuery = """
        SELECT DOMAIN_ID, DOMAIN_NAME
        FROM EC_RANDOM_DOMAINS
        INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
        WHERE STATE<2
        AND DOMAIN_SET=?
        AND DOMAIN_ALIAS IS NULL
        ORDER BY RAND()
        LIMIT ?
        """;

    List<BrowseResult> results = new ArrayList<>(count);

    try (var conn = dataSource.getConnection();
         var stmt = conn.prepareStatement(randomDomainsQuery))
    {
        stmt.setInt(1, set);
        stmt.setInt(2, count);

        var rows = stmt.executeQuery();
        while (rows.next()) {
            int domainId = rows.getInt(1);
            String domainName = rows.getString(2);

            if (blacklist.isBlacklisted(domainId)) {
                continue;
            }

            results.add(new BrowseResult(new EdgeDomain(domainName).toRootUrl(), domainId, 0));
        }
    }
    catch (SQLException ex) {
        // Best effort: report the failure and hand back what we have
        logger.error("SQL error", ex);
    }

    return results;
}
}

View File

@ -2,7 +2,10 @@ package nu.marginalia.browse.model;
import nu.marginalia.model.EdgeUrl;
public record BrowseResult (EdgeUrl url, int domainId, double relatedness) {
public record BrowseResult (EdgeUrl url,
int domainId,
double relatedness,
boolean indexed) {
public String domainHash() {
var domain = url.domain;

View File

@ -28,7 +28,7 @@ public class DatingSessionObject {
}
public BrowseResult nextSimilar(int domainId, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) {
adjacent.getDomainNeighborsAdjacentCosine(domainId, blacklist, 25).forEach(queue::addFirst);
adjacent.getDomainNeighborsAdjacentCosineRequireScreenshot(domainId, blacklist, 25).forEach(queue::addFirst);
while (queue.size() > MAX_QUEUE_SIZE) {
queue.removeLast();

View File

@ -52,7 +52,7 @@ public class SearchBrowseService {
public BrowseResultSet getRelatedEntries(String word) {
var domain = domainQueries.getDomainId(new EdgeDomain(word));
var neighbors = similarDomains.getDomainNeighborsAdjacentCosine(domain, blacklist, 256);
var neighbors = similarDomains.getDomainNeighborsAdjacentCosineRequireScreenshot(domain, blacklist, 256);
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
// If the results are very few, supplement with the alternative shitty algorithm

View File

@ -1,9 +1,9 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.client.Context;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
@ -12,38 +12,37 @@ import nu.marginalia.search.model.DomainInformation;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.siteinfo.DomainInformationService;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import spark.*;
import spark.Request;
import spark.Response;
import javax.annotation.Nullable;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
public class SearchSiteInfoService {
private final SearchOperator searchOperator;
private final SimilarDomainsService similarDomains;
private final DomainInformationService domainInformationService;
private final SearchFlagSiteService flagSiteService;
private final DbDomainQueries domainQueries;
private final SearchBrowseService browseService;
private final MustacheRenderer<Object> renderer;
@Inject
public SearchSiteInfoService(SearchOperator searchOperator,
SimilarDomainsService similarDomains,
DomainInformationService domainInformationService,
RendererFactory rendererFactory,
SearchFlagSiteService flagSiteService,
DbDomainQueries domainQueries, SearchBrowseService browseService) throws IOException {
DbDomainQueries domainQueries) throws IOException {
this.searchOperator = searchOperator;
this.similarDomains = similarDomains;
this.domainInformationService = domainInformationService;
this.flagSiteService = flagSiteService;
this.domainQueries = domainQueries;
this.renderer = rendererFactory.renderer("search/site-info/site-info");
this.browseService = browseService;
}
@ -60,10 +59,9 @@ public class SearchSiteInfoService {
var model = switch (view) {
case "links" -> listLinks(ctx, domainName);
case "docs" -> listDocs(ctx, domainName);
case "info" -> siteInfo(ctx, domainName);
case "similar" -> listSimilar(ctx, domainName);
case "info" -> listInfo(ctx, domainName);
case "report" -> reportSite(ctx, domainName);
default -> siteInfo(ctx, domainName);
default -> listInfo(ctx, domainName);
};
return renderer.renderInto(response, model);
@ -108,21 +106,6 @@ public class SearchSiteInfoService {
false);
}
/**
 * Builds the model for the "info" view of a domain.
 *
 * <p>When the domain has no id in the database, the model carries id -1,
 * no screenshot URL and placeholder domain information. Otherwise the
 * screenshot URL is derived from the id, and the domain information falls
 * back to the placeholder only if the information service has nothing.
 *
 * @param ctx        request context (not used by this method)
 * @param domainName the domain to describe
 */
private SiteInfo siteInfo(Context ctx, String domainName) {
    final OptionalInt maybeId = domainQueries.tryGetDomainId(new EdgeDomain(domainName));

    if (maybeId.isPresent()) {
        final int domainId = maybeId.getAsInt();

        final DomainInformation info = domainInformationService
                .domainInfo(domainName)
                .orElseGet(() -> dummyInformation(domainName));

        return new SiteInfo(domainName, domainId, "/screenshot/" + domainId, info);
    }

    // Unknown domain: nothing to look up
    return new SiteInfo(domainName, -1, null, dummyInformation(domainName));
}
private DomainInformation dummyInformation(String domainName) {
return DomainInformation.builder()
.domain(new EdgeDomain(domainName))
@ -136,11 +119,25 @@ public class SearchSiteInfoService {
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
searchOperator.doBacklinkSearch(ctx, domainName));
}
private SimilarSites listSimilar(Context ctx, String domainName) {
return new SimilarSites(domainName,
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
browseService.getRelatedEntries(domainName));
/**
 * Builds the combined "info" model: domain information together with the
 * similar-domain and linking-domain listings (each requested with a count
 * of 100).
 *
 * @param ctx        request context (not used by this method)
 * @param domainName the domain to describe
 */
private SiteInfoWithContext listInfo(Context ctx, String domainName) {
    // -1 stands in for a domain the database has no id for
    final int domainId = domainQueries
            .tryGetDomainId(new EdgeDomain(domainName))
            .orElse(-1);

    // Arguments are evaluated left to right, so the lookups run in the
    // order info -> similar -> linking.
    return new SiteInfoWithContext(
            domainName,
            domainId,
            domainInformationService.domainInfo(domainName).orElseGet(() -> dummyInformation(domainName)),
            similarDomains.getSimilarDomains(domainId, 100),
            similarDomains.getLinkingDomains(domainId, 100)
    );
}
private Docs listDocs(Context ctx, String domainName) {
return new Docs(domainName,
@ -148,51 +145,6 @@ public class SearchSiteInfoService {
searchOperator.doSiteSearch(ctx, domainName));
}
/**
 * Template model for the site "info" view.
 *
 * <p>{@code view} and {@code domainState} are single-entry maps whose key
 * names the active view / indexing state.
 */
public record SiteInfo(Map<String, Boolean> view,
                       Map<String, Boolean> domainState,
                       long domainId,
                       String domain,
                       @Nullable String screenshotUrl,
                       DomainInformation domainInformation)
{
    /**
     * Convenience constructor that marks the "info" view as active and
     * derives the domain state key from {@code domainInformation}.
     */
    public SiteInfo(String domain,
                    long domainId,
                    @Nullable String screenshotUrl,
                    DomainInformation domainInformation)
    {
        this(Map.of("info", true),
             Map.of(domainInfoState(domainInformation), true),
             domainId,
             domain,
             screenshotUrl,
             domainInformation);
    }

    /** Maps the domain information flags to a single state key; the first matching rule wins. */
    private static String domainInfoState(DomainInformation info) {
        if (info.isBlacklisted()) {
            return "blacklisted";
        }

        final boolean unknown = info.isUnknownDomain();

        if (!unknown && info.isSuggestForCrawling()) {
            return "suggestForCrawling";
        }
        if (info.isInCrawlQueue()) {
            return "inCrawlQueue";
        }

        return unknown ? "unknownDomain" : "indexed";
    }

    /** Search query restricted to this domain. */
    public String query() {
        return "site:" + domain;
    }

    /** True when the domain id is positive. */
    public boolean isKnown() {
        return domainId > 0;
    }
}
public record Docs(Map<String, Boolean> view,
String domain,
long domainId,
@ -222,12 +174,48 @@ public class SearchSiteInfoService {
}
}
public record SimilarSites(Map<String, Boolean> view, String domain, long domainId, List<BrowseResult> results) {
public SimilarSites(String domain, long domainId, BrowseResultSet results) {
this(Map.of("similar", true), domain, domainId, new ArrayList<>(results.results()));
public record SiteInfoWithContext(Map<String, Boolean> view,
Map<String, Boolean> domainState,
String domain,
long domainId,
DomainInformation domainInformation,
List<SimilarDomainsService.SimilarDomain> similar,
List<SimilarDomainsService.SimilarDomain> linking) {
public SiteInfoWithContext(String domain,
long domainId,
DomainInformation domainInformation,
List<SimilarDomainsService.SimilarDomain> similar,
List<SimilarDomainsService.SimilarDomain> linking
)
{
this(Map.of("info", true),
Map.of(domainInfoState(domainInformation), true),
domain,
domainId,
domainInformation,
similar,
linking);
}
public String query() { return "similar:" + domain; }
public String query() { return "site:" + domain; }
private static String domainInfoState(DomainInformation info) {
if (info.isBlacklisted()) {
return "blacklisted";
}
if (!info.isUnknownDomain() && info.isSuggestForCrawling()) {
return "suggestForCrawling";
}
if (info.isInCrawlQueue()) {
return "inCrawlQueue";
}
if (info.isUnknownDomain()) {
return "unknownDomain";
}
else {
return "indexed";
}
}
public boolean isKnown() {
return domainId > 0;

View File

@ -0,0 +1,245 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
public class SimilarDomainsService {
private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class);
private final HikariDataSource dataSource;
@Inject
public SimilarDomainsService(HikariDataSource dataSource) {
this.dataSource = dataSource;
}
/**
 * Direction of the link relationship between two domains, derived from two
 * independent "a link exists" flags.
 *
 * <p>{@link #toString()} yields an HTML entity (or {@code -}) meant to be
 * emitted unescaped; {@link #getDescription()} yields a human readable label.
 */
enum LinkType {
    STOD("&#8592;", "Backward Link"),
    DTOS("&#8594;", "Forward Link"),
    BIDI("&#8646;", "Mutual Link"),
    NONE("-", "No Link");

    private final String symbol;
    private final String description;

    LinkType(String symbol, String description) {
        this.symbol = symbol;
        this.description = description;
    }

    /**
     * Combines the two link flags into a link type.
     *
     * @param linkStod whether a link exists in the STOD direction
     * @param linkDtos whether a link exists in the DTOS direction
     * @return {@code BIDI} if both are set, the matching single direction if
     *         only one is set, otherwise {@code NONE}
     */
    public static LinkType find(boolean linkStod, boolean linkDtos) {
        if (linkStod) {
            return linkDtos ? BIDI : STOD;
        }
        return linkDtos ? DTOS : NONE;
    }

    /** Returns the arrow symbol for this link type as an HTML entity. */
    @Override
    public String toString() {
        return symbol;
    }

    /** Returns the descriptive label for this link type. */
    public String getDescription() {
        return description;
    }
}
/**
 * One row of a similar/linking domain listing.
 *
 * <p>As populated by {@code executeSimilarDomainsQueries}, {@code relatedness}
 * and {@code rank} are the database values scaled by 100 (rank is additionally
 * inverted), so that a higher number is better for both.
 */
public record SimilarDomain(EdgeUrl url,
                            int domainId,
                            double relatedness,
                            double rank,
                            boolean indexed,
                            boolean active,
                            boolean screenshot,
                            LinkType linkType)
{
    /**
     * Renders the rank as zero to five star entities ({@code &#9733;}).
     * One star is earned for each of the thresholds 10, 30, 50, 70 and 90
     * that the rank strictly exceeds.
     */
    public String getRankSymbols() {
        int stars = 0;

        for (int threshold = 10; threshold <= 90; threshold += 20) {
            if (rank > threshold) {
                stars++;
            }
        }

        return "&#9733;".repeat(stars);
    }
}
/**
 * A list of similar domains paired with a focus-domain string.
 * NOTE(review): nothing visible in this file constructs or reads this record,
 * and the meaning of {@code focusDomain} is not shown here; confirm before relying on it.
 */
public record SimilarDomainsSet(List<SimilarDomain> domains, String focusDomain) {
/** Creates a set whose focus domain is the empty string. */
public SimilarDomainsSet(List<SimilarDomain> domains) {
this(domains, "");
}
}
/**
 * Looks up the domains recorded as neighbors of {@code domainId} in
 * {@code EC_DOMAIN_NEIGHBORS_2}.
 *
 * <p>The neighbor table is consulted in both orientations: once with the
 * domain in the {@code DOMAIN_ID} column and once with it in the
 * {@code NEIGHBOR_ID} column. Each query is limited to {@code count} rows,
 * and the merged, de-duplicated result can therefore hold up to twice that.
 *
 * @param domainId id of the domain whose neighbors are wanted
 * @param count    per-query row limit
 * @return neighbors ordered by descending relatedness, ties broken by
 *         ascending domain id
 */
public List<SimilarDomain> getSimilarDomains(int domainId, int count) {
    // Orientation 1: the requested domain sits in DOMAIN_ID, the neighbor in NEIGHBOR_ID
    final String domainSideQuery = """
        SELECT
        NEIGHBOR.ID AS ID,
        NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
        SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
        NODE_AFFINITY > 0 AS INDEXED,
        STATE='ACTIVE' AS ACTIVE,
        RELATEDNESS,
        RANK,
        STOD.ID IS NOT NULL AS LINK_STOD,
        DTOS.ID IS NOT NULL AS LINK_DTOS
        FROM EC_DOMAIN_NEIGHBORS_2
        INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID = NEIGHBOR.ID
        LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
        LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID
        LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID
        WHERE DOMAIN_ID = ?
        ORDER BY RELATEDNESS DESC
        LIMIT ?
        """;

    // Orientation 2: the requested domain sits in NEIGHBOR_ID, the neighbor in DOMAIN_ID
    final String neighborSideQuery = """
        SELECT
        NEIGHBOR.ID AS ID,
        NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
        SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
        NODE_AFFINITY > 0 AS INDEXED,
        STATE='ACTIVE' AS ACTIVE,
        RELATEDNESS,
        RANK,
        STOD.ID IS NOT NULL AS LINK_STOD,
        DTOS.ID IS NOT NULL AS LINK_DTOS
        FROM EC_DOMAIN_NEIGHBORS_2
        INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID = NEIGHBOR.ID
        LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
        LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID
        LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID
        WHERE NEIGHBOR_ID = ?
        ORDER BY RELATEDNESS DESC
        LIMIT ?
        """;

    final List<SimilarDomain> neighbors =
            executeSimilarDomainsQueries(domainId, count, domainSideQuery, neighborSideQuery);

    // The two result sets are merely concatenated, so re-sort the union
    final Comparator<SimilarDomain> mostRelatedFirst = Comparator
            .comparingDouble(SimilarDomain::relatedness)
            .reversed()
            .thenComparingInt(SimilarDomain::domainId);

    neighbors.sort(mostRelatedFirst);

    return neighbors;
}
/**
 * Looks up the domains that share a link with {@code domainId} in
 * {@code EC_DOMAIN_LINK}, in either direction.
 *
 * <p>The first query finds domains that link <em>to</em> the given domain,
 * the second finds domains the given domain links <em>to</em>. Each row is
 * enriched with the relatedness from {@code EC_DOMAIN_NEIGHBORS_2} when the
 * pair is present there in either orientation (aliases {@code NA} and
 * {@code NB}), and 0 otherwise. Each query is limited to {@code count} rows,
 * so the merged, de-duplicated result can hold up to twice that.
 *
 * @param domainId id of the domain whose link partners are wanted
 * @param count    per-query row limit
 * @return linking domains ordered by descending rank, then descending
 *         relatedness, then indexed-before-unindexed, then ascending domain id
 */
public List<SimilarDomain> getLinkingDomains(int domainId, int count) {
    // Inbound links: NEIGHBOR is the link source, the requested domain the destination.
    // NA/NB look the pair up in the neighbor table in both orientations; each
    // alias must be matched on its own columns, otherwise the NB fallback in the
    // COALESCE can never produce a value.
    final String inboundQuery = """
        SELECT
        NEIGHBOR.ID AS ID,
        NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
        SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
        NODE_AFFINITY > 0 AS INDEXED,
        STATE='ACTIVE' AS ACTIVE,
        COALESCE(COALESCE(NA.RELATEDNESS, NB.RELATEDNESS), 0) AS RELATEDNESS,
        RANK,
        TRUE AS LINK_STOD,
        DTOS.ID IS NOT NULL AS LINK_DTOS
        FROM EC_DOMAIN_LINK STOD
        INNER JOIN EC_DOMAIN AS NEIGHBOR ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID
        LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NA ON STOD.SOURCE_DOMAIN_ID = NA.DOMAIN_ID AND STOD.DEST_DOMAIN_ID = NA.NEIGHBOR_ID
        LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NB ON STOD.SOURCE_DOMAIN_ID = NB.NEIGHBOR_ID AND STOD.DEST_DOMAIN_ID = NB.DOMAIN_ID
        LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
        LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = STOD.SOURCE_DOMAIN_ID AND DTOS.SOURCE_DOMAIN_ID = STOD.DEST_DOMAIN_ID
        WHERE STOD.DEST_DOMAIN_ID = ?
        GROUP BY NEIGHBOR.ID
        ORDER BY RELATEDNESS DESC
        LIMIT ?
        """;

    // Outbound links: the requested domain is the link source, NEIGHBOR the destination.
    final String outboundQuery = """
        SELECT
        NEIGHBOR.ID AS ID,
        NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
        SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
        NODE_AFFINITY > 0 AS INDEXED,
        STATE='ACTIVE' AS ACTIVE,
        COALESCE(COALESCE(NA.RELATEDNESS, NB.RELATEDNESS), 0) AS RELATEDNESS,
        RANK,
        STOD.ID IS NOT NULL AS LINK_STOD,
        TRUE AS LINK_DTOS
        FROM EC_DOMAIN_LINK DTOS
        INNER JOIN EC_DOMAIN AS NEIGHBOR ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID
        LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NA ON DTOS.DEST_DOMAIN_ID = NA.DOMAIN_ID AND DTOS.SOURCE_DOMAIN_ID = NA.NEIGHBOR_ID
        LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NB ON DTOS.DEST_DOMAIN_ID = NB.NEIGHBOR_ID AND DTOS.SOURCE_DOMAIN_ID = NB.DOMAIN_ID
        LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
        LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.DEST_DOMAIN_ID = DTOS.SOURCE_DOMAIN_ID AND STOD.SOURCE_DOMAIN_ID = DTOS.DEST_DOMAIN_ID
        WHERE DTOS.SOURCE_DOMAIN_ID = ?
        GROUP BY NEIGHBOR.ID
        ORDER BY RELATEDNESS DESC
        LIMIT ?
        """;

    var domains = executeSimilarDomainsQueries(domainId, count, inboundQuery, outboundQuery);

    // reversed() flips everything chained before it (rank, relatedness, indexed),
    // giving "best first"; the domain id tie-breaker added afterwards stays ascending.
    domains.sort(Comparator.comparing(SimilarDomain::rank)
            .thenComparing(SimilarDomain::relatedness)
            .thenComparing(SimilarDomain::indexed).reversed()
            .thenComparing(SimilarDomain::domainId));

    return domains;
}
/**
 * Runs each query in turn and merges the rows into one list, keeping only
 * the first occurrence of every domain id.
 *
 * <p>Every query must take two parameters (domain id, row limit) and yield
 * the columns {@code ID}, {@code DOMAIN_NAME}, {@code RELATEDNESS},
 * {@code RANK}, {@code INDEXED}, {@code ACTIVE}, {@code HAS_SCREENSHOT},
 * {@code LINK_STOD} and {@code LINK_DTOS}. Relatedness is scaled by 100;
 * rank is inverted and scaled by 100.
 *
 * @param domainId bound to the first parameter of each query
 * @param count    bound to the second parameter of each query; the merged
 *                 result is capped at {@code count * 2} entries
 * @param queries  the SQL statements to run, in order
 * @return the merged rows in query order; whatever was collected before a
 *         SQL error (possibly nothing) if a query fails
 */
private List<SimilarDomain> executeSimilarDomainsQueries(int domainId, int count, String... queries) {
    List<SimilarDomain> domains = new ArrayList<>(count);
    TIntHashSet seen = new TIntHashSet();

    try (var connection = dataSource.getConnection()) {
        for (var query : queries) {
            try (var stmt = connection.prepareStatement(query)) {
                stmt.setFetchSize(count);
                stmt.setInt(1, domainId);
                stmt.setInt(2, count);

                try (var rsp = stmt.executeQuery()) {
                    while (rsp.next() && domains.size() < count * 2) {
                        int id = rsp.getInt("ID");

                        // A domain may show up in more than one query; first hit wins
                        if (!seen.add(id)) {
                            continue;
                        }

                        boolean linkStod = rsp.getBoolean("LINK_STOD");
                        boolean linkDtos = rsp.getBoolean("LINK_DTOS");
                        LinkType linkType = LinkType.find(linkStod, linkDtos);

                        domains.add(new SimilarDomain(
                                new EdgeDomain(rsp.getString("DOMAIN_NAME")).toRootUrl(),
                                id,
                                100 * rsp.getDouble("RELATEDNESS"),
                                100 * (1. - rsp.getDouble("RANK")),
                                rsp.getBoolean("INDEXED"),
                                rsp.getBoolean("ACTIVE"),
                                rsp.getBoolean("HAS_SCREENSHOT"),
                                linkType
                        ));
                    }
                }
            }
        }
    } catch (SQLException ex) {
        // Best effort: the listing is supplementary, so log and return what we have
        logger.warn("Failed to get domain neighbors for domain {}", domainId, ex);
    }

    return domains;
}
}

View File

@ -12,6 +12,15 @@ $visited: #fcc;
* {
box-sizing: border-box;
}
h1 a, h2 a {
color: $fg-light;
}
h1 a:visited, h2 a:visited {
color: $visited;
}
progress {
width: 10ch;
}
body {
background-color: $nicotine-light;
@ -343,6 +352,49 @@ footer {
align-items: start;
}
#similar-view {
display: grid;
grid-template-columns: 1fr 1fr;
grid-template-rows: auto 1fr;
grid-gap: 1ch;
align-content: start;
justify-content: start;
align-items: start;
table {
th {
text-align: left;
}
}
.screenshot {
width: 100%;
height: auto;
}
}
#similar-info {
@extend .dialog;
}
#similar-domains {
grid-row: span 2;
@extend .dialog;
}
#similar-links {
@extend .dialog;
}
@media (max-device-width: 900px) {
#similar-view {
display: block;
* {
margin-bottom: 1ch;
}
}
}
#search-box {
@extend .shadowbox;

View File

@ -7,10 +7,4 @@
Pages Crawled: {{pagesFetched}} <br/>
Pages Indexed: {{pagesIndexed}} <br/>
</fieldset>
<br/>
{{#if pagesFetched}}
<p>
If you've found a reason why this website should not be indexed,
you may use <a href="/site/flag-site/{{domainId}}">this form</a> to file a report.<p>
{{/if}}
<br/>

View File

@ -1,4 +1,3 @@
<section id="index-info">
<h2>Indexing Information</h2>
{{#if domainState.blacklisted}}
{{>search/site-info/site-info-index-blacklisted}}
@ -21,5 +20,4 @@ It may take up to a month before it is indexed.
{{#if domainState.indexed}}
{{>search/site-info/site-info-index-indexed}}
{{/if}}
</section>
{{/if}}

View File

@ -1,9 +1,7 @@
<section id="link-info">
<h2>Links</h2>
<fieldset>
<legend>Link Graph</legend>
Ranking: {{ranking}}%<br/>
Incoming Links: {{incomingLinks}} <br/>
Outbound Links: {{outboundLinks}} <br/>
</fieldset>
</section>
<h2>Links</h2>
<fieldset>
<legend>Link Graph</legend>
Ranking: {{ranking}}%<br/>
Incoming Links: {{incomingLinks}} <br/>
Outbound Links: {{outboundLinks}} <br/>
</fieldset>

View File

@ -21,22 +21,13 @@
<ul>
<li {{#if info}}class="current"{{/if}}><a href="?view=info">Info</a></li>
<li {{#if docs}}class="current"{{/if}}>{{#if known}}<a href="?view=docs">Docs</a>{{/if}}{{#unless known}}<a class="link-unavailable" title="This domain is not known by the search engine">Docs</a>{{/unless}}</li>
<li {{#if links}}class="current"{{/if}}><a href="?view=links">Links</a></li>
<li {{#if browse}}class="current"{{/if}}><a href="?view=similar">Similar</a></li>
<li {{#if links}}class="current"{{/if}}><a href="?view=links">Backlinks</a></li>
<li {{#if report}}class="current"{{/if}}>{{#if known}}<a href="?view=report">Report</a>{{/if}}{{#unless known}}<a class="link-unavailable" title="This domain is not known by the search engine">Report</a>{{/unless}}</li>
</ul>
</nav>
{{/with}}
{{#if view.info}}{{#with domainInformation}}
<section id="siteinfo">
{{> search/site-info/site-info-screenshot}}
{{> search/site-info/site-info-index}}
{{> search/site-info/site-info-links}}
</section>
{{/with}}{{/if}}
{{#if view.links}}
<div class="infobox">
Showing search results with links to {{domain}}.
@ -56,11 +47,118 @@
{{>search/site-info/site-info-report}}
{{/if}}
{{#if view.similar}}
<div class="infobox">Showing domains similar to {{domain}}</div>
<section class="cards">
{{#each results}}{{>search/browse-result}}{{/each}}
</section>
{{#if view.info}}
<div class="infobox">
A <a href="/explore/{{domain}}">visual exploration</a> mode is also available.
</div>
<div id="similar-view">
<div id="similar-info">
<h2><span title="External Link">&#x1F30E;</span>&nbsp;<a rel="external noopener" href="https://{{domain}}/">{{domain}}</a></h2>
<a rel="external noopener" href="https://{{domain}}/">
<img class="screenshot" width="300" height="225" src="/screenshot/{{domainId}}" alt="Screenshot of {{domain}}" />
</a>
{{#with domainInformation}}
{{> search/site-info/site-info-index}}
{{> search/site-info/site-info-links}}
{{/with}}
</div>
{{#if similar}}
<div id="similar-domains">
<h2>Similar Domains</h2>
<table class="similarity-table">
<tr>
<th colspan="3">Meta</th>
<th>Rank</th>
<th>Domain</th>
<th>Similarity</th>
</tr>
{{#each similar}}
<tr>
<td>
{{#if indexed}}
{{#if active}}
<span title="Indexed">&#128064;</span>
{{/if}}
{{#unless active}}
<span title="Problem">&#128293;</span>
{{/unless}}
{{/if}}
</td>
<td>
{{#if screenshot}}&#x1f4f7;{{/if}}
</td>
<td>
<span title="{{linkType.description}}">{{{linkType}}}</span>
</td>
<td>
<span title="{{rank}}%">{{{rankSymbols}}}</span>
</td>
<td>
<a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
<td>
<progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
</td>
</tr>
{{/each}}
</table>
<p><b>Note</b>: Because two domains are considered similar does not always mean they're in
cahoots. Similarity is a measure of how often they appear in the same contexts,
which may be an association like peas and carrots, but some pairings are also defined by their
contrasting opposition, like Sparta and Athens.</p>
</div>
{{/if}}
{{#if linking}}
<div id="similar-links">
<h2>Linking Domains</h2>
<table class="similarity-table">
<tr>
<th colspan="3">Meta</th>
<th>Rank</th>
<th>Domain</th>
<th>Similarity</th>
</tr>
{{#each linking}}
<tr>
<td>
{{#if indexed}}
{{#if active}}
<span title="Indexed">&#128064;</span>
{{/if}}
{{#unless active}}
<span title="Problem">&#128293;</span>
{{/unless}}
{{/if}}
</td>
<td>
{{#if screenshot}}&#x1f4f7;{{/if}}
</td>
<td>
<span title="{{linkType.description}}">{{{linkType}}}</span>
</td>
<td>
<span title="{{rank}}%">{{{rankSymbols}}}</span>
</td>
<td>
<a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
<td>
<progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
</td>
</tr>
{{/each}}
</table>
</div>
{{/if}}
</div>
{{/if}}
{{>search/parts/search-footer}}