Clean up summary extractor module.

This commit is contained in:
Viktor Lofgren 2023-03-18 10:24:12 +01:00
parent 43430728aa
commit 8def95e849
4 changed files with 5 additions and 7 deletions

View File

@ -13,5 +13,4 @@ order of a 100,000,000 documents with a time budget of a couple of hours.
## Central Classes
* [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java)
* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algorithm.
Doesn't always work, but when it works it's pretty good.

View File

@ -2,7 +2,6 @@ package nu.marginalia.summary.heuristic;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.summary.SummaryExtractionFilter;
import org.jsoup.nodes.Document;
public class DomFilterHeuristic implements SummaryHeuristic {
@ -17,7 +16,7 @@ public class DomFilterHeuristic implements SummaryHeuristic {
public String summarize(Document doc) {
doc = doc.clone();
var filter = new SummaryExtractionFilter();
var filter = new SummarizingDOMFilter();
doc.filter(filter);

View File

@ -1,4 +1,4 @@
package nu.marginalia.summary;
package nu.marginalia.summary.heuristic;
import com.google.common.base.Strings;
import org.apache.commons.lang3.StringUtils;
@ -12,7 +12,7 @@ import java.util.*;
import static org.jsoup.internal.StringUtil.isActuallyWhitespace;
import static org.jsoup.internal.StringUtil.isInvisibleChar;
public class SummaryExtractionFilter implements NodeFilter {
public class SummarizingDOMFilter implements NodeFilter {
public Map<Node, NodeStatistics> statistics = new HashMap<>(10000);
public Map<Node, Integer> pos = new HashMap<>(10000);

View File

@ -29,7 +29,7 @@ class SummaryExtractorTest {
public void testSummaryFilter() throws IOException {
String html = readClassPathFile("html/monadnock.html");
var doc = Jsoup.parse(html);
var filter = new SummaryExtractionFilter();
var filter = new SummarizingDOMFilter();
doc.filter(filter);
filter.statistics.entrySet().stream().sorted(Comparator.comparing(e -> -e.getValue().textLength()))