Clean up summary extractor module.

This commit is contained in:
Viktor Lofgren 2023-03-18 10:21:41 +01:00
parent 6a20b2b678
commit 43430728aa
8 changed files with 164 additions and 88 deletions

View File

@ -2,10 +2,12 @@ package nu.marginalia.summary;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.summary.heuristic.*;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
public class SummaryExtractor {
@ -13,100 +15,36 @@ public class SummaryExtractor {
// Matches any run of three or more characters drawn from: '-', '.', ',', '!', '?', '\'' and space.
// Each such run is replaced with a single space before a summary is returned.
private final Pattern truncatedCharacters = Pattern.compile("[\\-.,!?' ]{3,}");
// The heuristics, in the order they are tried; populated by the constructor below.
private final List<SummaryHeuristic> heuristics = new ArrayList<>();
// NOTE(review): this listing is a diff rendered without +/- markers. The one-argument
// signature on the next line looks like the removed version; the multi-argument signature
// that follows it looks like its replacement -- confirm against the repository.
@Inject
public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength) {
public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength,
DomFilterHeuristic domFilterHeuristic,
TagDensityHeuristic tagDensityHeuristic,
OpenGraphDescriptionHeuristic ogTagHeuristic,
MetaDescriptionHeuristic metaDescriptionHeuristic,
FallbackHeuristic fallbackHeuristic)
{
this.maxSummaryLength = maxSummaryLength;
// Registration order is the order in which the heuristics are consulted.
heuristics.add(domFilterHeuristic);
heuristics.add(tagDensityHeuristic);
heuristics.add(ogTagHeuristic);
heuristics.add(metaDescriptionHeuristic);
heuristics.add(fallbackHeuristic);
}
// Produces the summary for a parsed document: obtains the raw summary text, replaces every
// run matched by truncatedCharacters with a single space, then abbreviates the result to
// maxSummaryLength using an empty abbreviation marker.
// NOTE(review): in this diff rendering the body below appears to be the pre-commit version;
// the loop over `heuristics` further down looks like its replacement -- confirm.
public String extractSummary(Document parsed) {
String summaryString = extractSummaryRaw(parsed);
summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);
return summaryString;
}
// Tries a fixed sequence of extraction strategies and returns the first non-blank result;
// the last strategy's result is returned unconditionally.
// NOTE(review): appears to be pre-commit code that the heuristic classes replace -- confirm.
private String extractSummaryRaw(Document parsed) {
String maybe;
// Strips header/navigation/list elements directly from the passed-in document (not a copy),
// so the caller's Document is modified.
parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
// Plan A
// Both Plan A strategies are handed a clone of the document.
maybe = getSummaryNew(parsed.clone());
if (!maybe.isBlank()) return maybe;
maybe = getSummaryByTagDensity(parsed.clone());
if (!maybe.isBlank()) return maybe;
// Plan B: Open Graph Description
// NOTE(review): only <meta name="og:description"> is matched here; Open Graph tags are
// normally declared with a `property` attribute -- verify whether that case matters.
maybe = parsed.select("meta[name=og:description]").attr("content");
if (!maybe.isBlank()) return maybe;
// Plan C: Ye Olde meta-description
maybe = parsed.select("meta[name=description]").attr("content");
if (!maybe.isBlank()) return maybe;
// Plan D: The kitchen sink?
return lastDitchSummaryEffort(parsed);
}
// Runs a SummaryExtractionFilter over the given document and returns the summary it
// collected, requesting up to maxSummaryLength + 32 characters.
// NOTE(review): the reason for the extra 32 characters is not visible here -- presumably
// slack for the later clean-up/abbreviation step; confirm.
private String getSummaryNew(Document parsed) {
var filter = new SummaryExtractionFilter();
parsed.filter(filter);
return filter.getSummary(maxSummaryLength+32);
}
// Concatenates the text of elements that contain little link text and little markup.
// NOTE(review): this span interleaves removed and added diff lines, so the braces do not
// balance as shown. The `for (var heuristic : heuristics)` loop appears to belong to the
// post-commit extractSummary body rather than to this method -- confirm against the repository.
private String getSummaryByTagDensity(Document parsed) {
StringBuilder content = new StringBuilder();
for (var elem : parsed.select("p,div,section,article,font,center")) {
// Stop collecting once at least maxSummaryLength characters have been gathered.
if (content.length() >= maxSummaryLength) break;
String tagName = elem.tagName();
// Skip p/center/font elements holding fewer than 16 characters of text.
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
&& elem.text().length() < 16)
{
continue;
}
// Keep elements where under 10% of the text is link text and the text accounts for
// more than 85% of the element's inner HTML length.
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
content.append(elem.text()).append(' ');
// Asks each registered heuristic in turn; the first non-blank answer is cleaned with
// truncatedCharacters and abbreviated to maxSummaryLength.
for (var heuristic : heuristics) {
String maybe = heuristic.summarize(parsed);
if (!maybe.isBlank()) {
String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
}
}
// Only results longer than 32 characters are returned; the line below is a
// 32-character ruler illustrating that length.
if (content.length() > 32) {
// AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
return content.toString();
}
return "";
}
// Final fallback: returns the document's body text after removing anchors. Operates on the
// document it is given, so that document is modified.
private String lastDitchSummaryEffort(Document parsed) {
// Measured before anything is removed.
int bodyTextLength = parsed.body().text().length();
parsed.getElementsByTag("a").remove();
// NOTE(review): every <a> element has already been removed above, so aTagDensity(elem)
// can only evaluate to 0 (or NaN for empty elements) and the condition below cannot be
// true -- this pruning pass looks like it was meant to run before the anchors were stripped.
for (var elem : parsed.select("p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th")) {
if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > 0.25) {
elem.remove();
}
}
return parsed.body().text();
}
// Ratio of the element's text length to the length of its inner HTML; values near 1 mean
// the element is almost entirely text. If the inner HTML is empty the division is 0.0/0,
// which is NaN in Java, so any comparison against the result is false.
private double htmlTagDensity(Element elem) {
return (double) elem.text().length() / elem.html().length();
}
// Ratio of the text length of the `a` elements returned by getElementsByTag to the
// element's total text length. If the element has no text the division is 0.0/0, which is
// NaN in Java, so any comparison against the result is false.
private double aTagDensity(Element elem) {
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
}
}

View File

@ -0,0 +1,26 @@
package nu.marginalia.summary.heuristic;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.summary.SummaryExtractionFilter;
import org.jsoup.nodes.Document;
/**
 * Summary heuristic that walks a copy of the document with a
 * {@link SummaryExtractionFilter} and returns whatever summary the filter collected.
 */
public class DomFilterHeuristic implements SummaryHeuristic {
    private final int maxSummaryLength;

    /**
     * @param maxSummaryLength the configured maximum summary length, bound to the
     *                         {@code max-summary-length} name
     */
    @Inject
    public DomFilterHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
        this.maxSummaryLength = maxSummaryLength;
    }

    /**
     * Filters a clone of {@code doc}, leaving the passed-in instance untouched by this method.
     *
     * @param doc the parsed document
     * @return the filter's summary, requested with a budget of {@code maxSummaryLength + 32}
     */
    @Override
    public String summarize(Document doc) {
        final Document workingCopy = doc.clone();
        final var extractionFilter = new SummaryExtractionFilter();

        workingCopy.filter(extractionFilter);

        final int lengthBudget = maxSummaryLength + 32;
        return extractionFilter.getSummary(lengthBudget);
    }
}

View File

@ -0,0 +1,28 @@
package nu.marginalia.summary.heuristic;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Last-resort summary heuristic: returns the body text of a copy of the document after
 * link-dominated elements and all remaining anchors have been stripped from it.
 */
public class FallbackHeuristic implements SummaryHeuristic {

    /** Elements that may be dropped wholesale when they consist mostly of link text. */
    private static final String PRUNABLE_ELEMENTS =
            "p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th";

    /** Share of link text above which an element is treated as link-dominated. */
    private static final double MAX_LINK_DENSITY = 0.25;

    /**
     * Works on a clone, so the passed-in document is not modified by this method.
     *
     * @param doc the parsed document
     * @return the remaining body text, or an empty string if the document has no body
     */
    @Override
    public String summarize(Document doc) {
        doc = doc.clone();

        Element body = doc.body();
        if (body == null) {
            // Some jsoup versions return null for documents without a <body>.
            return "";
        }

        int bodyTextLength = body.text().length();

        // Prune link-dominated elements while the anchors are still present. Link density
        // can only be measured before the anchors are stripped; doing it afterwards always
        // yields zero and the pruning never fires.
        for (var elem : doc.select(PRUNABLE_ELEMENTS)) {
            if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > MAX_LINK_DENSITY) {
                elem.remove();
            }
        }

        // Strip whatever anchors survived the pruning pass.
        doc.getElementsByTag("a").remove();

        return body.text();
    }

    /**
     * Fraction of the element's text that sits inside {@code a} elements.
     *
     * @return a value in [0, 1]; 0 for elements without any text
     */
    private double aTagDensity(Element elem) {
        int textLength = elem.text().length();
        if (textLength == 0) {
            // Avoid 0.0/0 (NaN); an empty element has no link text to speak of.
            return 0.0;
        }
        return (double) elem.getElementsByTag("a").text().length() / textLength;
    }
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.summary.heuristic;
import org.jsoup.nodes.Document;
/**
 * Summary heuristic that reads the {@code content} attribute of the document's
 * {@code <meta name="description">} element.
 */
public class MetaDescriptionHeuristic implements SummaryHeuristic {

    /**
     * @param doc the parsed document
     * @return the {@code content} attribute value taken from the elements matching
     *         {@code meta[name=description]}
     */
    @Override
    public String summarize(Document doc) {
        final var descriptionTags = doc.select("meta[name=description]");
        return descriptionTags.attr("content");
    }
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.summary.heuristic;
import org.jsoup.nodes.Document;
/**
 * Summary heuristic that reads the document's Open Graph description.
 *
 * <p>Open Graph metadata is declared as {@code <meta property="og:description" ...>}.
 * Pages that put the key in the {@code name} attribute instead are matched as well.
 */
public class OpenGraphDescriptionHeuristic implements SummaryHeuristic {

    /** Matches the {@code property} form as well as the {@code name} variant. */
    private static final String OG_DESCRIPTION_SELECTOR =
            "meta[property=og:description],meta[name=og:description]";

    /**
     * @param doc the parsed document
     * @return the {@code content} attribute value taken from the matching meta elements
     */
    @Override
    public String summarize(Document doc) {
        return doc.select(OG_DESCRIPTION_SELECTOR).attr("content");
    }
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.summary.heuristic;
import org.jsoup.nodes.Document;
/**
 * A single strategy for deriving a summary string from a parsed document.
 *
 * <p>SummaryExtractor asks its heuristics one after another and uses the first result
 * that is not blank.
 */
public interface SummaryHeuristic {
/**
 * Attempts to summarize the given document.
 *
 * @param doc the parsed document
 * @return the summary text; SummaryExtractor calls {@code isBlank()} on the result without
 *         a null check and treats a blank string as "no summary found"
 */
String summarize(Document doc);
}

View File

@ -0,0 +1,53 @@
package nu.marginalia.summary.heuristic;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Summary heuristic that stitches together the text of elements which contain little
 * link text and little markup relative to their text.
 */
public class TagDensityHeuristic implements SummaryHeuristic {
    private final int maxSummaryLength;

    /**
     * @param maxSummaryLength the configured maximum summary length, bound to the
     *                         {@code max-summary-length} name
     */
    @Inject
    public TagDensityHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
        this.maxSummaryLength = maxSummaryLength;
    }

    /**
     * Collects qualifying element texts, each followed by a space, from a clone of the
     * document until at least {@code maxSummaryLength} characters have been gathered.
     *
     * @param doc the parsed document
     * @return the collected text if it is longer than 32 characters, otherwise an empty string
     */
    @Override
    public String summarize(Document doc) {
        final Document copy = doc.clone();
        final StringBuilder collected = new StringBuilder();

        for (Element candidate : copy.select("p,div,section,article,font,center")) {
            if (collected.length() >= maxSummaryLength) {
                break;
            }
            if (isShortInlineBlock(candidate)) {
                continue;
            }
            if (isMostlyPlainText(candidate)) {
                collected.append(candidate.text()).append(' ');
            }
        }

        // Anything of 32 characters or fewer is discarded; for scale:
        // AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
        return collected.length() > 32 ? collected.toString() : "";
    }

    /** True for p, center and font elements holding fewer than 16 characters of text. */
    private boolean isShortInlineBlock(Element elem) {
        final String tag = elem.tagName();
        final boolean lengthChecked = "p".equals(tag) || "center".equals(tag) || "font".equals(tag);
        return lengthChecked && elem.text().length() < 16;
    }

    /** True when under 10% of the text is link text and text exceeds 85% of the inner HTML. */
    private boolean isMostlyPlainText(Element elem) {
        return aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85;
    }

    /** Ratio of the element's text length to the length of its inner HTML. */
    private double htmlTagDensity(Element elem) {
        final int textLength = elem.text().length();
        final int markupLength = elem.html().length();
        return (double) textLength / markupLength;
    }

    /** Ratio of the text length inside {@code a} elements to the element's text length. */
    private double aTagDensity(Element elem) {
        final int linkTextLength = elem.getElementsByTag("a").text().length();
        final int textLength = elem.text().length();
        return (double) linkTextLength / textLength;
    }
}

View File

@ -1,7 +1,6 @@
package nu.marginalia.summary;
import nu.marginalia.summary.SummaryExtractionFilter;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.summary.heuristic.*;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
@ -18,7 +17,12 @@ class SummaryExtractorTest {
// Extractor under test; re-created before every test by setUp().
SummaryExtractor summaryExtractor;
// Builds the extractor with a maximum summary length of 255 and one instance of each heuristic.
// NOTE(review): this is a diff rendered without +/- markers; the one-argument constructor
// call appears to be the removed line and the multi-argument call its replacement -- confirm.
@BeforeEach
public void setUp() {
summaryExtractor = new SummaryExtractor(255);
summaryExtractor = new SummaryExtractor(255,
new DomFilterHeuristic(255),
new TagDensityHeuristic(255),
new OpenGraphDescriptionHeuristic(),
new MetaDescriptionHeuristic(),
new FallbackHeuristic());
}
@Test