Clean up summary extractor module.
This commit is contained in:
parent
6a20b2b678
commit
43430728aa
@ -2,10 +2,12 @@ package nu.marginalia.summary;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.summary.heuristic.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class SummaryExtractor {
|
||||
@ -13,100 +15,36 @@ public class SummaryExtractor {
|
||||
|
||||
private final Pattern truncatedCharacters = Pattern.compile("[\\-.,!?' ]{3,}");
|
||||
|
||||
private final List<SummaryHeuristic> heuristics = new ArrayList<>();
|
||||
|
||||
@Inject
|
||||
public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength) {
|
||||
public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength,
|
||||
DomFilterHeuristic domFilterHeuristic,
|
||||
TagDensityHeuristic tagDensityHeuristic,
|
||||
OpenGraphDescriptionHeuristic ogTagHeuristic,
|
||||
MetaDescriptionHeuristic metaDescriptionHeuristic,
|
||||
FallbackHeuristic fallbackHeuristic)
|
||||
{
|
||||
this.maxSummaryLength = maxSummaryLength;
|
||||
|
||||
heuristics.add(domFilterHeuristic);
|
||||
heuristics.add(tagDensityHeuristic);
|
||||
heuristics.add(ogTagHeuristic);
|
||||
heuristics.add(metaDescriptionHeuristic);
|
||||
heuristics.add(fallbackHeuristic);
|
||||
}
|
||||
|
||||
public String extractSummary(Document parsed) {
|
||||
String summaryString = extractSummaryRaw(parsed);
|
||||
|
||||
summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
|
||||
summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);
|
||||
|
||||
return summaryString;
|
||||
}
|
||||
|
||||
|
||||
private String extractSummaryRaw(Document parsed) {
|
||||
|
||||
String maybe;
|
||||
|
||||
parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
|
||||
|
||||
// Plan A
|
||||
maybe = getSummaryNew(parsed.clone());
|
||||
if (!maybe.isBlank()) return maybe;
|
||||
|
||||
maybe = getSummaryByTagDensity(parsed.clone());
|
||||
if (!maybe.isBlank()) return maybe;
|
||||
|
||||
// Plan B: Open Graph Description
|
||||
maybe = parsed.select("meta[name=og:description]").attr("content");
|
||||
if (!maybe.isBlank()) return maybe;
|
||||
|
||||
// Plan C: Ye Olde meta-description
|
||||
maybe = parsed.select("meta[name=description]").attr("content");
|
||||
if (!maybe.isBlank()) return maybe;
|
||||
|
||||
// Plan D: The kitchen sink?
|
||||
return lastDitchSummaryEffort(parsed);
|
||||
}
|
||||
|
||||
private String getSummaryNew(Document parsed) {
|
||||
var filter = new SummaryExtractionFilter();
|
||||
|
||||
parsed.filter(filter);
|
||||
|
||||
return filter.getSummary(maxSummaryLength+32);
|
||||
}
|
||||
|
||||
private String getSummaryByTagDensity(Document parsed) {
|
||||
StringBuilder content = new StringBuilder();
|
||||
|
||||
for (var elem : parsed.select("p,div,section,article,font,center")) {
|
||||
if (content.length() >= maxSummaryLength) break;
|
||||
|
||||
String tagName = elem.tagName();
|
||||
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
|
||||
&& elem.text().length() < 16)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
|
||||
content.append(elem.text()).append(' ');
|
||||
for (var heuristic : heuristics) {
|
||||
String maybe = heuristic.summarize(parsed);
|
||||
if (!maybe.isBlank()) {
|
||||
String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
|
||||
return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
|
||||
}
|
||||
}
|
||||
|
||||
if (content.length() > 32) {
|
||||
// AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
|
||||
return content.toString();
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private String lastDitchSummaryEffort(Document parsed) {
|
||||
int bodyTextLength = parsed.body().text().length();
|
||||
|
||||
parsed.getElementsByTag("a").remove();
|
||||
|
||||
for (var elem : parsed.select("p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th")) {
|
||||
if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > 0.25) {
|
||||
elem.remove();
|
||||
}
|
||||
}
|
||||
|
||||
return parsed.body().text();
|
||||
}
|
||||
|
||||
private double htmlTagDensity(Element elem) {
|
||||
return (double) elem.text().length() / elem.html().length();
|
||||
}
|
||||
|
||||
private double aTagDensity(Element elem) {
|
||||
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,26 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.summary.SummaryExtractionFilter;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
public class DomFilterHeuristic implements SummaryHeuristic {
|
||||
private final int maxSummaryLength;
|
||||
|
||||
@Inject
|
||||
public DomFilterHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
|
||||
this.maxSummaryLength = maxSummaryLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String summarize(Document doc) {
|
||||
doc = doc.clone();
|
||||
|
||||
var filter = new SummaryExtractionFilter();
|
||||
|
||||
doc.filter(filter);
|
||||
|
||||
return filter.getSummary(maxSummaryLength+32);
|
||||
}
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
public class FallbackHeuristic implements SummaryHeuristic {
|
||||
|
||||
@Override
|
||||
public String summarize(Document doc) {
|
||||
doc = doc.clone();
|
||||
|
||||
int bodyTextLength = doc.body().text().length();
|
||||
|
||||
doc.getElementsByTag("a").remove();
|
||||
|
||||
for (var elem : doc.select("p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th")) {
|
||||
if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > 0.25) {
|
||||
elem.remove();
|
||||
}
|
||||
}
|
||||
|
||||
return doc.body().text();
|
||||
}
|
||||
|
||||
private double aTagDensity(Element elem) {
|
||||
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
||||
}
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
public class MetaDescriptionHeuristic implements SummaryHeuristic {
|
||||
@Override
|
||||
public String summarize(Document doc) {
|
||||
return doc.select("meta[name=description]").attr("content");
|
||||
}
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
public class OpenGraphDescriptionHeuristic implements SummaryHeuristic {
|
||||
@Override
|
||||
public String summarize(Document doc) {
|
||||
return doc.select("meta[name=og:description]").attr("content");
|
||||
}
|
||||
}
|
@ -0,0 +1,7 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
public interface SummaryHeuristic {
|
||||
String summarize(Document doc);
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
public class TagDensityHeuristic implements SummaryHeuristic {
|
||||
private final int maxSummaryLength;
|
||||
|
||||
@Inject
|
||||
public TagDensityHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
|
||||
this.maxSummaryLength = maxSummaryLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String summarize(Document doc) {
|
||||
doc = doc.clone();
|
||||
|
||||
StringBuilder content = new StringBuilder();
|
||||
|
||||
for (var elem : doc.select("p,div,section,article,font,center")) {
|
||||
if (content.length() >= maxSummaryLength) break;
|
||||
|
||||
String tagName = elem.tagName();
|
||||
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
|
||||
&& elem.text().length() < 16)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
|
||||
content.append(elem.text()).append(' ');
|
||||
}
|
||||
}
|
||||
|
||||
if (content.length() > 32) {
|
||||
// AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
|
||||
return content.toString();
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private double htmlTagDensity(Element elem) {
|
||||
return (double) elem.text().length() / elem.html().length();
|
||||
}
|
||||
|
||||
private double aTagDensity(Element elem) {
|
||||
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
||||
}
|
||||
|
||||
}
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.summary;
|
||||
|
||||
import nu.marginalia.summary.SummaryExtractionFilter;
|
||||
import nu.marginalia.summary.SummaryExtractor;
|
||||
import nu.marginalia.summary.heuristic.*;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@ -18,7 +17,12 @@ class SummaryExtractorTest {
|
||||
SummaryExtractor summaryExtractor;
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
summaryExtractor = new SummaryExtractor(255);
|
||||
summaryExtractor = new SummaryExtractor(255,
|
||||
new DomFilterHeuristic(255),
|
||||
new TagDensityHeuristic(255),
|
||||
new OpenGraphDescriptionHeuristic(),
|
||||
new MetaDescriptionHeuristic(),
|
||||
new FallbackHeuristic());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
Loading…
Reference in New Issue
Block a user