Merge pull request 'Fix index.hdb errors from copying' (#106) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/106
This commit is contained in:
commit
5eb16fc643
@ -0,0 +1,74 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class SalientImageDetector {
|
||||
|
||||
public boolean hasSalientImage(Document document) {
|
||||
document.getElementsByTag("a").removeIf(Element::hasText);
|
||||
|
||||
Map<String, Integer> counts = new HashMap<>();
|
||||
for (var elem : document.getElementsByTag("img")) {
|
||||
counts.merge(elem.attr("src"), 1, Integer::sum);
|
||||
}
|
||||
for (var elem : document.select("p,div,section,article,font,center")) {
|
||||
|
||||
String tagName = elem.tagName();
|
||||
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
|
||||
&& elem.text().length() < 16)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
|
||||
for (var imgTag : elem.getElementsByTag("img")) {
|
||||
if (counts.getOrDefault(imgTag.attr("src"), 1) > 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isSmall(imgTag)) {
|
||||
if (!imgTag.id().isBlank()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
private boolean isSmall(Element imgTag) {
|
||||
final String width = imgTag.attr("width");
|
||||
final String height = imgTag.attr("height");
|
||||
|
||||
if (width.isBlank() || height.isBlank())
|
||||
return true;
|
||||
|
||||
try {
|
||||
if (Integer.parseInt(width) < 400)
|
||||
return true;
|
||||
if (Integer.parseInt(height) < 400)
|
||||
return true;
|
||||
}
|
||||
catch (NumberFormatException ex) { /* no-op */ }
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private double htmlTagDensity(Element elem) {
|
||||
return (double) elem.text().length() / elem.html().length();
|
||||
}
|
||||
|
||||
private double aTagDensity(Element elem) {
|
||||
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
||||
}
|
||||
|
||||
}
|
@ -2,12 +2,11 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Marginalia Search}</title>
|
||||
<title>Marginalia Search</title>
|
||||
|
||||
<link rel="stylesheet" href="/style-new.css" />
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<meta name="robots" content="noindex" />
|
||||
|
||||
<meta property="og:description" content="search.marginalia.nu is a small independent do-it-yourself search engine for surprising but content-rich websites that never ask you to accept cookies or subscribe to newsletters. The goal is to bring you the sort of grass fed, free range HTML your grandma used to write. " />
|
||||
<meta property="og:locale" content="en_US" />
|
||||
|
Loading…
Reference in New Issue
Block a user