Merge pull request 'Fix index.hdb errors from copying' (#106) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/106
This commit is contained in:
Viktor Lofgren 2022-09-02 09:36:15 +02:00
commit 5eb16fc643
2 changed files with 75 additions and 2 deletions

View File

@ -0,0 +1,74 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.HashMap;
import java.util.Map;
/**
 * Heuristic check for whether a parsed HTML document contains at least one
 * image that sits inside a text-heavy, link-sparse container element.
 *
 * <p>The detector holds no state; every call works only on the document
 * passed in.
 */
public class SalientImageDetector {

    /** Containers of these "inline-ish" tags are skipped when their text is shorter than this. */
    private static final int MIN_TEXT_LENGTH = 16;

    /** Upper bound (exclusive) on the share of a container's text that comes from {@code <a>} tags. */
    private static final double MAX_LINK_DENSITY = 0.1;

    /** Lower bound (exclusive) on the ratio of a container's text length to its inner-HTML length. */
    private static final double MIN_TEXT_DENSITY = 0.85;

    /** Images with a declared width or height below this many units are treated as small. */
    private static final int SMALL_DIMENSION_LIMIT = 400;

    /**
     * Scans {@code p}, {@code div}, {@code section}, {@code article},
     * {@code font} and {@code center} elements and reports whether any
     * qualifying container holds an acceptable image.
     *
     * @param document the parsed document to inspect
     * @return {@code true} as soon as one acceptable image is found inside a
     *         qualifying container, {@code false} if none is found
     */
    public boolean hasSalientImage(Document document) {
        // Removes anchors that have text from the list returned by getElementsByTag.
        // NOTE(review): whether this also detaches them from the document depends on
        // the jsoup version in use -- confirm the intended effect.
        document.getElementsByTag("a").removeIf(Element::hasText);

        final Map<String, Integer> srcOccurrences = countImageSources(document);

        for (var container : document.select("p,div,section,article,font,center")) {
            if (isShortInlineContainer(container)) {
                continue;
            }
            if (isTextHeavyAndLinkSparse(container) && containsAcceptableImage(container, srcOccurrences)) {
                return true;
            }
        }
        return false;
    }

    /** Tallies how many {@code img} elements in the document share each {@code src} value. */
    private Map<String, Integer> countImageSources(Document document) {
        final Map<String, Integer> tally = new HashMap<>();
        for (var img : document.getElementsByTag("img")) {
            tally.merge(img.attr("src"), 1, Integer::sum);
        }
        return tally;
    }

    /** True for {@code p}, {@code center} or {@code font} elements whose text is under 16 characters. */
    private boolean isShortInlineContainer(Element container) {
        final String tag = container.tagName();
        final boolean inlineish = "p".equals(tag) || "center".equals(tag) || "font".equals(tag);
        return inlineish && container.text().length() < MIN_TEXT_LENGTH;
    }

    /**
     * True when the container's link density is below the maximum and its
     * text density is above the minimum. Both ratios are NaN for an element
     * without text or markup, in which case the comparisons are false.
     */
    private boolean isTextHeavyAndLinkSparse(Element container) {
        return aTagDensity(container) < MAX_LINK_DENSITY
                && htmlTagDensity(container) > MIN_TEXT_DENSITY;
    }

    /**
     * Looks for an image under the container whose {@code src} is not repeated
     * in the document and which is not a small image carrying a non-blank id.
     */
    private boolean containsAcceptableImage(Element container, Map<String, Integer> srcOccurrences) {
        for (var img : container.getElementsByTag("img")) {
            final boolean repeated = srcOccurrences.getOrDefault(img.attr("src"), 1) > 1;
            if (repeated) {
                continue;
            }
            final boolean smallWithId = isSmall(img) && !img.id().isBlank();
            if (smallWithId) {
                continue;
            }
            return true;
        }
        return false;
    }

    /**
     * Classifies an image as small when either dimension attribute is blank
     * or parses to less than 400. If a dimension that needs to be parsed is
     * not a plain integer, the image is treated as not small.
     */
    private boolean isSmall(Element imgTag) {
        final String declaredWidth = imgTag.attr("width");
        final String declaredHeight = imgTag.attr("height");

        if (declaredWidth.isBlank() || declaredHeight.isBlank()) {
            return true;
        }

        try {
            // Height is only parsed when the width is not already below the limit.
            return Integer.parseInt(declaredWidth) < SMALL_DIMENSION_LIMIT
                    || Integer.parseInt(declaredHeight) < SMALL_DIMENSION_LIMIT;
        }
        catch (NumberFormatException ignored) {
            // Non-numeric dimension (as rejected by Integer.parseInt): not classified as small.
            return false;
        }
    }

    /** Ratio of the element's text length to the length of its inner HTML. */
    private double htmlTagDensity(Element elem) {
        final int textLength = elem.text().length();
        final int markupLength = elem.html().length();
        return (double) textLength / markupLength;
    }

    /** Ratio of the combined text length of the element's {@code a} tags to its total text length. */
    private double aTagDensity(Element elem) {
        final int linkTextLength = elem.getElementsByTag("a").text().length();
        final int totalTextLength = elem.text().length();
        return (double) linkTextLength / totalTextLength;
    }
}

View File

@ -2,12 +2,11 @@
<html> <html>
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>Marginalia Search}</title> <title>Marginalia Search</title>
<link rel="stylesheet" href="/style-new.css" /> <link rel="stylesheet" href="/style-new.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia"> <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
<meta property="og:description" content="search.marginalia.nu is a small independent do-it-yourself search engine for surprising but content-rich websites that never ask you to accept cookies or subscribe to newsletters. The goal is to bring you the sort of grass fed, free range HTML your grandma used to write. " /> <meta property="og:description" content="search.marginalia.nu is a small independent do-it-yourself search engine for surprising but content-rich websites that never ask you to accept cookies or subscribe to newsletters. The goal is to bring you the sort of grass fed, free range HTML your grandma used to write. " />
<meta property="og:locale" content="en_US" /> <meta property="og:locale" content="en_US" />