(search) Merge similar sites results with the info view.
WIP: This commit needs to be cleaned up.
This commit is contained in:
parent
b41bb9cfcf
commit
8a1934008c
@ -26,7 +26,7 @@ public class DbBrowseDomainsRandom {
|
||||
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
||||
|
||||
final String q = """
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||
FROM EC_RANDOM_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
WHERE STATE<2
|
||||
@ -44,9 +44,10 @@ public class DbBrowseDomainsRandom {
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
boolean indexed = rsp.getBoolean("INDEXED");
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, indexed));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.browse;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
@ -23,14 +24,15 @@ public class DbBrowseDomainsSimilarCosine {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public List<BrowseResult> getDomainNeighborsAdjacentCosine(int domainId, DomainBlacklist blacklist, int count) {
|
||||
public List<BrowseResult> getDomainNeighborsAdjacentCosineRequireScreenshot(int domainId, DomainBlacklist blacklist, int count) {
|
||||
List<BrowseResult> domains = new ArrayList<>(count);
|
||||
|
||||
String q = """
|
||||
SELECT
|
||||
EC_DOMAIN.ID,
|
||||
NV.NEIGHBOR_NAME,
|
||||
NV.RELATEDNESS
|
||||
NV.RELATEDNESS,
|
||||
EC_DOMAIN.INDEXED
|
||||
FROM EC_NEIGHBORS_VIEW NV
|
||||
INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME=NV.NEIGHBOR_NAME
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
|
||||
@ -49,9 +51,10 @@ public class DbBrowseDomainsSimilarCosine {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
double relatedness = rsp.getDouble(3);
|
||||
boolean indexed = rsp.getBoolean("INDEXED");
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness, indexed));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
|
||||
final Set<BrowseResult> domains = new HashSet<>(count*3);
|
||||
|
||||
final String q = """
|
||||
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
|
||||
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT, INDEXED
|
||||
FROM EC_DOMAIN_NEIGHBORS
|
||||
INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
@ -54,14 +54,14 @@ public class DbBrowseDomainsSimilarOldAlgo {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (domains.size() < count/2) {
|
||||
final String q2 = """
|
||||
SELECT EC_DOMAIN.ID, DOMAIN_NAME
|
||||
SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
|
||||
@ -83,7 +83,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -91,7 +91,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
|
||||
|
||||
if (domains.size() < count/2) {
|
||||
final String q3 = """
|
||||
SELECT EC_DOMAIN.ID, DOMAIN_NAME
|
||||
SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
|
||||
@ -115,7 +115,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -128,38 +128,5 @@ public class DbBrowseDomainsSimilarOldAlgo {
|
||||
return new ArrayList<>(domains);
|
||||
}
|
||||
|
||||
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
||||
|
||||
final String q = """
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
FROM EC_RANDOM_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
WHERE STATE<2
|
||||
AND DOMAIN_SET=?
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY RAND()
|
||||
LIMIT ?
|
||||
""";
|
||||
List<BrowseResult> domains = new ArrayList<>(count);
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement(q)) {
|
||||
stmt.setInt(1, set);;
|
||||
stmt.setInt(2, count);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("SQL error", ex);
|
||||
}
|
||||
return domains;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,7 +2,10 @@ package nu.marginalia.browse.model;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
public record BrowseResult (EdgeUrl url, int domainId, double relatedness) {
|
||||
public record BrowseResult (EdgeUrl url,
|
||||
int domainId,
|
||||
double relatedness,
|
||||
boolean indexed) {
|
||||
|
||||
public String domainHash() {
|
||||
var domain = url.domain;
|
||||
|
@ -28,7 +28,7 @@ public class DatingSessionObject {
|
||||
}
|
||||
|
||||
public BrowseResult nextSimilar(int domainId, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) {
|
||||
adjacent.getDomainNeighborsAdjacentCosine(domainId, blacklist, 25).forEach(queue::addFirst);
|
||||
adjacent.getDomainNeighborsAdjacentCosineRequireScreenshot(domainId, blacklist, 25).forEach(queue::addFirst);
|
||||
|
||||
while (queue.size() > MAX_QUEUE_SIZE) {
|
||||
queue.removeLast();
|
||||
|
@ -52,7 +52,7 @@ public class SearchBrowseService {
|
||||
public BrowseResultSet getRelatedEntries(String word) {
|
||||
var domain = domainQueries.getDomainId(new EdgeDomain(word));
|
||||
|
||||
var neighbors = similarDomains.getDomainNeighborsAdjacentCosine(domain, blacklist, 256);
|
||||
var neighbors = similarDomains.getDomainNeighborsAdjacentCosineRequireScreenshot(domain, blacklist, 256);
|
||||
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
|
||||
|
||||
// If the results are very few, supplement with the alternative shitty algorithm
|
||||
|
@ -1,9 +1,9 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.browse.model.BrowseResultSet;
|
||||
import nu.marginalia.client.Context;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
@ -12,38 +12,37 @@ import nu.marginalia.search.model.DomainInformation;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.siteinfo.DomainInformationService;
|
||||
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
|
||||
import spark.*;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.OptionalInt;
|
||||
|
||||
public class SearchSiteInfoService {
|
||||
|
||||
private final SearchOperator searchOperator;
|
||||
private final SimilarDomainsService similarDomains;
|
||||
private final DomainInformationService domainInformationService;
|
||||
private final SearchFlagSiteService flagSiteService;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final SearchBrowseService browseService;
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
|
||||
@Inject
|
||||
public SearchSiteInfoService(SearchOperator searchOperator,
|
||||
SimilarDomainsService similarDomains,
|
||||
DomainInformationService domainInformationService,
|
||||
RendererFactory rendererFactory,
|
||||
SearchFlagSiteService flagSiteService,
|
||||
DbDomainQueries domainQueries, SearchBrowseService browseService) throws IOException {
|
||||
DbDomainQueries domainQueries) throws IOException {
|
||||
this.searchOperator = searchOperator;
|
||||
this.similarDomains = similarDomains;
|
||||
this.domainInformationService = domainInformationService;
|
||||
this.flagSiteService = flagSiteService;
|
||||
this.domainQueries = domainQueries;
|
||||
|
||||
this.renderer = rendererFactory.renderer("search/site-info/site-info");
|
||||
this.browseService = browseService;
|
||||
|
||||
}
|
||||
|
||||
@ -60,10 +59,9 @@ public class SearchSiteInfoService {
|
||||
var model = switch (view) {
|
||||
case "links" -> listLinks(ctx, domainName);
|
||||
case "docs" -> listDocs(ctx, domainName);
|
||||
case "info" -> siteInfo(ctx, domainName);
|
||||
case "similar" -> listSimilar(ctx, domainName);
|
||||
case "info" -> listInfo(ctx, domainName);
|
||||
case "report" -> reportSite(ctx, domainName);
|
||||
default -> siteInfo(ctx, domainName);
|
||||
default -> listInfo(ctx, domainName);
|
||||
};
|
||||
|
||||
return renderer.renderInto(response, model);
|
||||
@ -108,21 +106,6 @@ public class SearchSiteInfoService {
|
||||
false);
|
||||
}
|
||||
|
||||
private SiteInfo siteInfo(Context ctx, String domainName) {
|
||||
OptionalInt id = domainQueries.tryGetDomainId(new EdgeDomain(domainName));
|
||||
|
||||
if (id.isEmpty()) {
|
||||
return new SiteInfo(domainName, -1, null, dummyInformation(domainName));
|
||||
}
|
||||
|
||||
String screenshotPath = "/screenshot/"+id.getAsInt();
|
||||
DomainInformation domainInfo = domainInformationService
|
||||
.domainInfo(domainName)
|
||||
.orElseGet(() -> dummyInformation(domainName));
|
||||
|
||||
return new SiteInfo(domainName, id.getAsInt(), screenshotPath, domainInfo);
|
||||
}
|
||||
|
||||
private DomainInformation dummyInformation(String domainName) {
|
||||
return DomainInformation.builder()
|
||||
.domain(new EdgeDomain(domainName))
|
||||
@ -136,11 +119,25 @@ public class SearchSiteInfoService {
|
||||
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
|
||||
searchOperator.doBacklinkSearch(ctx, domainName));
|
||||
}
|
||||
private SimilarSites listSimilar(Context ctx, String domainName) {
|
||||
|
||||
return new SimilarSites(domainName,
|
||||
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
|
||||
browseService.getRelatedEntries(domainName));
|
||||
private SiteInfoWithContext listInfo(Context ctx, String domainName) {
|
||||
|
||||
final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
|
||||
|
||||
final DomainInformation domainInfo = domainInformationService.domainInfo(domainName)
|
||||
.orElseGet(() -> dummyInformation(domainName));
|
||||
|
||||
final List<SimilarDomainsService.SimilarDomain> similarSet =
|
||||
similarDomains.getSimilarDomains(domainId, 100);
|
||||
final List<SimilarDomainsService.SimilarDomain> linkingDomains =
|
||||
similarDomains.getLinkingDomains(domainId, 100);
|
||||
|
||||
return new SiteInfoWithContext(domainName,
|
||||
domainId,
|
||||
domainInfo,
|
||||
similarSet,
|
||||
linkingDomains
|
||||
);
|
||||
}
|
||||
private Docs listDocs(Context ctx, String domainName) {
|
||||
return new Docs(domainName,
|
||||
@ -148,51 +145,6 @@ public class SearchSiteInfoService {
|
||||
searchOperator.doSiteSearch(ctx, domainName));
|
||||
}
|
||||
|
||||
public record SiteInfo(Map<String, Boolean> view,
|
||||
Map<String, Boolean> domainState,
|
||||
long domainId,
|
||||
String domain,
|
||||
@Nullable String screenshotUrl,
|
||||
DomainInformation domainInformation)
|
||||
{
|
||||
public SiteInfo(String domain,
|
||||
long domainId,
|
||||
@Nullable String screenshotUrl,
|
||||
DomainInformation domainInformation)
|
||||
{
|
||||
this(Map.of("info", true),
|
||||
Map.of(domainInfoState(domainInformation), true),
|
||||
domainId,
|
||||
domain,
|
||||
screenshotUrl,
|
||||
domainInformation);
|
||||
}
|
||||
|
||||
private static String domainInfoState(DomainInformation info) {
|
||||
if (info.isBlacklisted()) {
|
||||
return "blacklisted";
|
||||
}
|
||||
if (!info.isUnknownDomain() && info.isSuggestForCrawling()) {
|
||||
return "suggestForCrawling";
|
||||
}
|
||||
if (info.isInCrawlQueue()) {
|
||||
return "inCrawlQueue";
|
||||
}
|
||||
if (info.isUnknownDomain()) {
|
||||
return "unknownDomain";
|
||||
}
|
||||
else {
|
||||
return "indexed";
|
||||
}
|
||||
}
|
||||
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record Docs(Map<String, Boolean> view,
|
||||
String domain,
|
||||
long domainId,
|
||||
@ -222,12 +174,48 @@ public class SearchSiteInfoService {
|
||||
}
|
||||
}
|
||||
|
||||
public record SimilarSites(Map<String, Boolean> view, String domain, long domainId, List<BrowseResult> results) {
|
||||
public SimilarSites(String domain, long domainId, BrowseResultSet results) {
|
||||
this(Map.of("similar", true), domain, domainId, new ArrayList<>(results.results()));
|
||||
public record SiteInfoWithContext(Map<String, Boolean> view,
|
||||
Map<String, Boolean> domainState,
|
||||
String domain,
|
||||
long domainId,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomainsService.SimilarDomain> similar,
|
||||
List<SimilarDomainsService.SimilarDomain> linking) {
|
||||
public SiteInfoWithContext(String domain,
|
||||
long domainId,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomainsService.SimilarDomain> similar,
|
||||
List<SimilarDomainsService.SimilarDomain> linking
|
||||
)
|
||||
{
|
||||
this(Map.of("info", true),
|
||||
Map.of(domainInfoState(domainInformation), true),
|
||||
domain,
|
||||
domainId,
|
||||
domainInformation,
|
||||
similar,
|
||||
linking);
|
||||
}
|
||||
|
||||
public String query() { return "similar:" + domain; }
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
private static String domainInfoState(DomainInformation info) {
|
||||
if (info.isBlacklisted()) {
|
||||
return "blacklisted";
|
||||
}
|
||||
if (!info.isUnknownDomain() && info.isSuggestForCrawling()) {
|
||||
return "suggestForCrawling";
|
||||
}
|
||||
if (info.isInCrawlQueue()) {
|
||||
return "inCrawlQueue";
|
||||
}
|
||||
if (info.isUnknownDomain()) {
|
||||
return "unknownDomain";
|
||||
}
|
||||
else {
|
||||
return "indexed";
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
|
@ -0,0 +1,245 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
public class SimilarDomainsService {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class);
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
public SimilarDomainsService(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
/**
 * Describes which hyperlink relationships exist between a listed domain and
 * the domain being inspected, as derived from the {@code LINK_STOD} and
 * {@code LINK_DTOS} columns of the similarity queries.
 *
 * <p>{@link #toString()} yields a single arrow glyph for compact display and
 * {@link #getDescription()} a short human-readable label.</p>
 */
enum LinkType {
    /** Only the STOD link exists; described as a "Backward Link". */
    STOD,
    /** Only the DTOS link exists; described as a "Forward Link". */
    DTOS,
    /** Both links exist; described as a "Mutual Link". */
    BIDI,
    /** Neither link exists. */
    NONE;

    /**
     * Maps the two link flags onto the matching constant.
     *
     * @param linkStod whether the STOD link is present
     * @param linkDtos whether the DTOS link is present
     * @return {@link #BIDI} if both are set, {@link #DTOS} or {@link #STOD}
     *         if exactly one is set, otherwise {@link #NONE}
     */
    public static LinkType find(boolean linkStod, boolean linkDtos) {
        if (linkDtos && linkStod)
            return BIDI;
        if (linkDtos)
            return DTOS;
        if (linkStod)
            return STOD;

        return NONE;
    }

    /** Returns the arrow glyph (or {@code "-"} for {@link #NONE}) representing this link type. */
    @Override
    public String toString() {
        return switch (this) {
            case DTOS -> "→";
            case STOD -> "←";
            case BIDI -> "⇆";
            case NONE -> "-";
        };
    }

    /** Returns a short English label for this link type. */
    public String getDescription() {
        return switch (this) {
            case STOD -> "Backward Link";
            case DTOS -> "Forward Link";
            case BIDI -> "Mutual Link";
            case NONE -> "No Link";
        };
    }
}
|
||||
|
||||
public record SimilarDomain(EdgeUrl url,
|
||||
int domainId,
|
||||
double relatedness,
|
||||
double rank,
|
||||
boolean indexed,
|
||||
boolean active,
|
||||
boolean screenshot,
|
||||
LinkType linkType)
|
||||
{
|
||||
public String getRankSymbols() {
|
||||
if (rank > 90) {
|
||||
return "★★★★★";
|
||||
}
|
||||
if (rank > 70) {
|
||||
return "★★★★";
|
||||
}
|
||||
if (rank > 50) {
|
||||
return "★★★";
|
||||
}
|
||||
if (rank > 30) {
|
||||
return "★★";
|
||||
}
|
||||
if (rank > 10) {
|
||||
return "★";
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public record SimilarDomainsSet(List<SimilarDomain> domains, String focusDomain) {
|
||||
public SimilarDomainsSet(List<SimilarDomain> domains) {
|
||||
this(domains, "");
|
||||
}
|
||||
}
|
||||
|
||||
public List<SimilarDomain> getSimilarDomains(int domainId, int count) {
|
||||
// Tell me you've worked in enterprise software without telling me you've worked in enterprise software
|
||||
String q1 = """
|
||||
SELECT
|
||||
NEIGHBOR.ID AS ID,
|
||||
NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
|
||||
SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
|
||||
NODE_AFFINITY > 0 AS INDEXED,
|
||||
STATE='ACTIVE' AS ACTIVE,
|
||||
RELATEDNESS,
|
||||
RANK,
|
||||
STOD.ID IS NOT NULL AS LINK_STOD,
|
||||
DTOS.ID IS NOT NULL AS LINK_DTOS
|
||||
FROM EC_DOMAIN_NEIGHBORS_2
|
||||
INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID = NEIGHBOR.ID
|
||||
LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
|
||||
LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID
|
||||
LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID
|
||||
WHERE DOMAIN_ID = ?
|
||||
ORDER BY RELATEDNESS DESC
|
||||
LIMIT ?
|
||||
""";
|
||||
String q2 = """
|
||||
SELECT
|
||||
NEIGHBOR.ID AS ID,
|
||||
NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
|
||||
SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
|
||||
NODE_AFFINITY > 0 AS INDEXED,
|
||||
STATE='ACTIVE' AS ACTIVE,
|
||||
RELATEDNESS,
|
||||
RANK,
|
||||
STOD.ID IS NOT NULL AS LINK_STOD,
|
||||
DTOS.ID IS NOT NULL AS LINK_DTOS
|
||||
FROM EC_DOMAIN_NEIGHBORS_2
|
||||
INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID = NEIGHBOR.ID
|
||||
LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
|
||||
LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID
|
||||
LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID
|
||||
WHERE NEIGHBOR_ID = ?
|
||||
ORDER BY RELATEDNESS DESC
|
||||
LIMIT ?
|
||||
""";
|
||||
|
||||
var domains = executeSimilarDomainsQueries(domainId, count, q1, q2);
|
||||
|
||||
domains.sort(Comparator.comparing(SimilarDomain::relatedness).reversed().thenComparing(SimilarDomain::domainId));
|
||||
|
||||
return domains;
|
||||
}
|
||||
|
||||
public List<SimilarDomain> getLinkingDomains(int domainId, int count) {
|
||||
// Tell me you've worked in enterprise software without telling me you've worked in enterprise software
|
||||
String q1 = """
|
||||
SELECT
|
||||
NEIGHBOR.ID AS ID,
|
||||
NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
|
||||
SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
|
||||
NODE_AFFINITY > 0 AS INDEXED,
|
||||
STATE='ACTIVE' AS ACTIVE,
|
||||
COALESCE(COALESCE(NA.RELATEDNESS, NB.RELATEDNESS), 0) AS RELATEDNESS,
|
||||
RANK,
|
||||
TRUE AS LINK_STOD,
|
||||
DTOS.ID IS NOT NULL AS LINK_DTOS
|
||||
FROM EC_DOMAIN_LINK STOD
|
||||
INNER JOIN EC_DOMAIN AS NEIGHBOR ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID
|
||||
LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NA ON STOD.SOURCE_DOMAIN_ID = NA.DOMAIN_ID AND STOD.DEST_DOMAIN_ID = NA.NEIGHBOR_ID
|
||||
LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NB ON STOD.SOURCE_DOMAIN_ID = NB.NEIGHBOR_ID AND STOD.DEST_DOMAIN_ID = NA.DOMAIN_ID
|
||||
LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
|
||||
LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = STOD.SOURCE_DOMAIN_ID AND DTOS.SOURCE_DOMAIN_ID = STOD.DEST_DOMAIN_ID
|
||||
WHERE STOD.DEST_DOMAIN_ID = ?
|
||||
GROUP BY NEIGHBOR.ID
|
||||
ORDER BY RELATEDNESS DESC
|
||||
LIMIT ?
|
||||
""";
|
||||
String q2 = """
|
||||
SELECT
|
||||
NEIGHBOR.ID AS ID,
|
||||
NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
|
||||
SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
|
||||
NODE_AFFINITY > 0 AS INDEXED,
|
||||
STATE='ACTIVE' AS ACTIVE,
|
||||
COALESCE(COALESCE(NA.RELATEDNESS, NB.RELATEDNESS), 0) AS RELATEDNESS,
|
||||
RANK,
|
||||
STOD.ID IS NOT NULL AS LINK_STOD,
|
||||
TRUE AS LINK_DTOS
|
||||
FROM EC_DOMAIN_LINK DTOS
|
||||
INNER JOIN EC_DOMAIN AS NEIGHBOR ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID
|
||||
LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NA ON DTOS.DEST_DOMAIN_ID = NA.DOMAIN_ID AND DTOS.SOURCE_DOMAIN_ID = NA.NEIGHBOR_ID
|
||||
LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NB ON DTOS.DEST_DOMAIN_ID = NB.NEIGHBOR_ID AND DTOS.SOURCE_DOMAIN_ID = NA.DOMAIN_ID
|
||||
LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
|
||||
LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.DEST_DOMAIN_ID = DTOS.SOURCE_DOMAIN_ID AND STOD.SOURCE_DOMAIN_ID = DTOS.DEST_DOMAIN_ID
|
||||
WHERE DTOS.SOURCE_DOMAIN_ID = ?
|
||||
GROUP BY NEIGHBOR.ID
|
||||
ORDER BY RELATEDNESS DESC
|
||||
LIMIT ?
|
||||
""";
|
||||
|
||||
var domains = executeSimilarDomainsQueries(domainId, count, q1, q2);
|
||||
|
||||
domains.sort(Comparator.comparing(SimilarDomain::rank)
|
||||
.thenComparing(SimilarDomain::relatedness)
|
||||
.thenComparing(SimilarDomain::indexed).reversed()
|
||||
.thenComparing(SimilarDomain::domainId));
|
||||
|
||||
return domains;
|
||||
}
|
||||
|
||||
private List<SimilarDomain> executeSimilarDomainsQueries(int domainId, int count, String... queries) {
|
||||
List<SimilarDomain> domains = new ArrayList<>(count);
|
||||
TIntHashSet seen = new TIntHashSet();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
for (var query : queries) {
|
||||
try (var stmt = connection.prepareStatement(query)) {
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId);
|
||||
stmt.setInt(2, count);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next() && domains.size() < count * 2) {
|
||||
int id = rsp.getInt("ID");
|
||||
|
||||
if (seen.add(id)) {
|
||||
boolean linkStod = rsp.getBoolean("LINK_STOD");
|
||||
boolean linkDtos = rsp.getBoolean("LINK_DTOS");
|
||||
LinkType linkType = LinkType.find(linkStod, linkDtos);
|
||||
|
||||
domains.add(new SimilarDomain(
|
||||
new EdgeDomain(rsp.getString("DOMAIN_NAME")).toRootUrl(),
|
||||
id,
|
||||
100 * rsp.getDouble("RELATEDNESS"),
|
||||
100 * (1. - rsp.getDouble("RANK")),
|
||||
rsp.getBoolean("INDEXED"),
|
||||
rsp.getBoolean("ACTIVE"),
|
||||
rsp.getBoolean("HAS_SCREENSHOT"),
|
||||
linkType
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
logger.warn("Failed to get domain neighbors for domain", throwables);
|
||||
}
|
||||
|
||||
return domains;
|
||||
}
|
||||
}
|
@ -12,6 +12,15 @@ $visited: #fcc;
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
h1 a, h2 a {
|
||||
color: $fg-light;
|
||||
}
|
||||
h1 a:visited, h2 a:visited {
|
||||
color: $visited;
|
||||
}
|
||||
progress {
|
||||
width: 10ch;
|
||||
}
|
||||
|
||||
body {
|
||||
background-color: $nicotine-light;
|
||||
@ -343,6 +352,49 @@ footer {
|
||||
align-items: start;
|
||||
}
|
||||
|
||||
|
||||
#similar-view {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
grid-template-rows: auto 1fr;
|
||||
grid-gap: 1ch;
|
||||
align-content: start;
|
||||
justify-content: start;
|
||||
align-items: start;
|
||||
table {
|
||||
th {
|
||||
text-align: left;
|
||||
}
|
||||
}
|
||||
.screenshot {
|
||||
width: 100%;
|
||||
height: auto;
|
||||
}
|
||||
}
|
||||
|
||||
#similar-info {
|
||||
@extend .dialog;
|
||||
}
|
||||
|
||||
#similar-domains {
|
||||
grid-row: span 2;
|
||||
|
||||
@extend .dialog;
|
||||
}
|
||||
|
||||
#similar-links {
|
||||
@extend .dialog;
|
||||
}
|
||||
|
||||
@media (max-device-width: 900px) {
|
||||
#similar-view {
|
||||
display: block;
|
||||
* {
|
||||
margin-bottom: 1ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#search-box {
|
||||
@extend .shadowbox;
|
||||
|
||||
|
@ -7,10 +7,4 @@
|
||||
Pages Crawled: {{pagesFetched}} <br/>
|
||||
Pages Indexed: {{pagesIndexed}} <br/>
|
||||
</fieldset>
|
||||
<br/>
|
||||
|
||||
{{#if pagesFetched}}
|
||||
<p>
|
||||
If you've found a reason why this website should not be indexed,
|
||||
you may use <a href="/site/flag-site/{{domainId}}">this form</a> to file a report.<p>
|
||||
{{/if}}
|
||||
<br/>
|
@ -1,4 +1,3 @@
|
||||
<section id="index-info">
|
||||
<h2>Indexing Information</h2>
|
||||
{{#if domainState.blacklisted}}
|
||||
{{>search/site-info/site-info-index-blacklisted}}
|
||||
@ -21,5 +20,4 @@ It may take up to a month before it is indexed.
|
||||
|
||||
{{#if domainState.indexed}}
|
||||
{{>search/site-info/site-info-index-indexed}}
|
||||
{{/if}}
|
||||
</section>
|
||||
{{/if}}
|
@ -1,9 +1,7 @@
|
||||
<section id="link-info">
|
||||
<h2>Links</h2>
|
||||
<fieldset>
|
||||
<legend>Link Graph</legend>
|
||||
Ranking: {{ranking}}%<br/>
|
||||
Incoming Links: {{incomingLinks}} <br/>
|
||||
Outbound Links: {{outboundLinks}} <br/>
|
||||
</fieldset>
|
||||
</section>
|
||||
<h2>Links</h2>
|
||||
<fieldset>
|
||||
<legend>Link Graph</legend>
|
||||
Ranking: {{ranking}}%<br/>
|
||||
Incoming Links: {{incomingLinks}} <br/>
|
||||
Outbound Links: {{outboundLinks}} <br/>
|
||||
</fieldset>
|
@ -21,22 +21,13 @@
|
||||
<ul>
|
||||
<li {{#if info}}class="current"{{/if}}><a href="?view=info">Info</a></li>
|
||||
<li {{#if docs}}class="current"{{/if}}>{{#if known}}<a href="?view=docs">Docs</a>{{/if}}{{#unless known}}<a class="link-unavailable" title="This domain is not known by the search engine">Docs</a>{{/unless}}</li>
|
||||
<li {{#if links}}class="current"{{/if}}><a href="?view=links">Links</a></li>
|
||||
<li {{#if browse}}class="current"{{/if}}><a href="?view=similar">Similar</a></li>
|
||||
<li {{#if links}}class="current"{{/if}}><a href="?view=links">Backlinks</a></li>
|
||||
|
||||
<li {{#if report}}class="current"{{/if}}>{{#if known}}<a href="?view=report">Report</a>{{/if}}{{#unless known}}<a class="link-unavailable" title="This domain is not known by the search engine">Report</a>{{/unless}}</li>
|
||||
</ul>
|
||||
</nav>
|
||||
{{/with}}
|
||||
|
||||
{{#if view.info}}{{#with domainInformation}}
|
||||
<section id="siteinfo">
|
||||
{{> search/site-info/site-info-screenshot}}
|
||||
{{> search/site-info/site-info-index}}
|
||||
{{> search/site-info/site-info-links}}
|
||||
</section>
|
||||
{{/with}}{{/if}}
|
||||
|
||||
{{#if view.links}}
|
||||
<div class="infobox">
|
||||
Showing search results with links to {{domain}}.
|
||||
@ -56,11 +47,118 @@
|
||||
{{>search/site-info/site-info-report}}
|
||||
{{/if}}
|
||||
|
||||
{{#if view.similar}}
|
||||
<div class="infobox">Showing domains similar to {{domain}}</div>
|
||||
<section class="cards">
|
||||
{{#each results}}{{>search/browse-result}}{{/each}}
|
||||
</section>
|
||||
{{#if view.info}}
|
||||
<div class="infobox">
|
||||
A <a href="/explore/{{domain}}">visual exploration</a> mode is also available.
|
||||
</div>
|
||||
|
||||
|
||||
<div id="similar-view">
|
||||
<div id="similar-info">
|
||||
<h2><span title="External Link">🌎</span> <a rel="external noopener" href="https://{{domain}}/">{{domain}}</a></h2>
|
||||
|
||||
|
||||
<a rel="external noopener" href="https://{{domain}}/">
|
||||
<img class="screenshot" width="300" height="225" src="/screenshot/{{domainId}}" alt="Screenshot of {{domain}}" />
|
||||
</a>
|
||||
{{#with domainInformation}}
|
||||
{{> search/site-info/site-info-index}}
|
||||
{{> search/site-info/site-info-links}}
|
||||
{{/with}}
|
||||
</div>
|
||||
|
||||
{{#if similar}}
|
||||
<div id="similar-domains">
|
||||
<h2>Similar Domains</h2>
|
||||
|
||||
<table class="similarity-table">
|
||||
<tr>
|
||||
<th colspan="3">Meta</th>
|
||||
<th>Rank</th>
|
||||
<th>Domain</th>
|
||||
<th>Similarity</th>
|
||||
</tr>
|
||||
{{#each similar}}
|
||||
<tr>
|
||||
<td>
|
||||
{{#if indexed}}
|
||||
{{#if active}}
|
||||
<span title="Indexed">👀</span>
|
||||
{{/if}}
|
||||
{{#unless active}}
|
||||
<span title="Problem">🔥</span>
|
||||
{{/unless}}
|
||||
{{/if}}
|
||||
</td>
|
||||
<td>
|
||||
{{#if screenshot}}📷{{/if}}
|
||||
</td>
|
||||
<td>
|
||||
<span title="{{linkType.description}}">{{{linkType}}}</span>
|
||||
</td>
|
||||
<td>
|
||||
<span title="{{rank}}%">{{{rankSymbols}}}</span>
|
||||
</td>
|
||||
<td>
|
||||
<a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
|
||||
<td>
|
||||
<progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
{{/each}}
|
||||
</table>
|
||||
<p><b>Note</b>: Because two domains are considered similar does not always mean they're in
|
||||
cahoots. Similarity is a measure of how often they appear in the same contexts,
|
||||
which may be an association like peas and carrots, but some pairings are also defined by their
|
||||
contrasting opposition, like Sparta and Athens.</p>
|
||||
</div>
|
||||
{{/if}}
|
||||
|
||||
{{#if linking}}
|
||||
<div id="similar-links">
|
||||
<h2>Linking Domains</h2>
|
||||
|
||||
<table class="similarity-table">
|
||||
<tr>
|
||||
<th colspan="3">Meta</th>
|
||||
<th>Rank</th>
|
||||
<th>Domain</th>
|
||||
<th>Similarity</th>
|
||||
</tr>
|
||||
{{#each linking}}
|
||||
<tr>
|
||||
<td>
|
||||
{{#if indexed}}
|
||||
{{#if active}}
|
||||
<span title="Indexed">👀</span>
|
||||
{{/if}}
|
||||
{{#unless active}}
|
||||
<span title="Problem">🔥</span>
|
||||
{{/unless}}
|
||||
{{/if}}
|
||||
</td>
|
||||
<td>
|
||||
{{#if screenshot}}📷{{/if}}
|
||||
</td>
|
||||
<td>
|
||||
<span title="{{linkType.description}}">{{{linkType}}}</span>
|
||||
</td>
|
||||
<td>
|
||||
<span title="{{rank}}%">{{{rankSymbols}}}</span>
|
||||
</td>
|
||||
<td>
|
||||
<a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
|
||||
<td>
|
||||
<progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
{{/each}}
|
||||
</table>
|
||||
</div>
|
||||
{{/if}}
|
||||
</div>
|
||||
{{/if}}
|
||||
|
||||
{{>search/parts/search-footer}}
|
||||
|
Loading…
Reference in New Issue
Block a user