Clean up summary extractor module.
This commit is contained in:
parent
43430728aa
commit
8def95e849
@ -13,5 +13,4 @@ order of 100,000,000 documents with a time budget of a couple of hours.
|
||||
## Central Classes
|
||||
|
||||
* [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java)
|
||||
* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algorithm.
|
||||
Doesn't always work, but when it works it's pretty good.
|
||||
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.summary.heuristic;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.summary.SummaryExtractionFilter;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
public class DomFilterHeuristic implements SummaryHeuristic {
|
||||
@ -17,7 +16,7 @@ public class DomFilterHeuristic implements SummaryHeuristic {
|
||||
public String summarize(Document doc) {
|
||||
doc = doc.clone();
|
||||
|
||||
var filter = new SummaryExtractionFilter();
|
||||
var filter = new SummarizingDOMFilter();
|
||||
|
||||
doc.filter(filter);
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.summary;
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
@ -12,7 +12,7 @@ import java.util.*;
|
||||
import static org.jsoup.internal.StringUtil.isActuallyWhitespace;
|
||||
import static org.jsoup.internal.StringUtil.isInvisibleChar;
|
||||
|
||||
public class SummaryExtractionFilter implements NodeFilter {
|
||||
public class SummarizingDOMFilter implements NodeFilter {
|
||||
|
||||
public Map<Node, NodeStatistics> statistics = new HashMap<>(10000);
|
||||
public Map<Node, Integer> pos = new HashMap<>(10000);
|
@ -29,7 +29,7 @@ class SummaryExtractorTest {
|
||||
public void testSummaryFilter() throws IOException {
|
||||
String html = readClassPathFile("html/monadnock.html");
|
||||
var doc = Jsoup.parse(html);
|
||||
var filter = new SummaryExtractionFilter();
|
||||
var filter = new SummarizingDOMFilter();
|
||||
doc.filter(filter);
|
||||
|
||||
filter.statistics.entrySet().stream().sorted(Comparator.comparing(e -> -e.getValue().textLength()))
|
||||
|
Loading…
Reference in New Issue
Block a user