From 8def95e8493b73132af4179cca6b4740fe602c84 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 18 Mar 2023 10:24:12 +0100 Subject: [PATCH] Clean up summary extractor module. --- code/features-convert/summary-extraction/readme.md | 3 +-- .../nu/marginalia/summary/heuristic/DomFilterHeuristic.java | 3 +-- .../SummarizingDOMFilter.java} | 4 ++-- .../test/java/nu/marginalia/summary/SummaryExtractorTest.java | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) rename code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/{SummaryExtractionFilter.java => heuristic/SummarizingDOMFilter.java} (98%) diff --git a/code/features-convert/summary-extraction/readme.md b/code/features-convert/summary-extraction/readme.md index 8c75d238..dc14b366 100644 --- a/code/features-convert/summary-extraction/readme.md +++ b/code/features-convert/summary-extraction/readme.md @@ -13,5 +13,4 @@ order of a 100,000,000 documents with a time budget of a couple of hours. ## Central Classes * [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java) -* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algo. - Doesn't always work, but when it works it's pretty good. + diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java index cb24dd2d..05002f35 100644 --- a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java +++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java @@ -2,7 +2,6 @@ package nu.marginalia.summary.heuristic; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.summary.SummaryExtractionFilter; import org.jsoup.nodes.Document; public class DomFilterHeuristic implements SummaryHeuristic { @@ -17,7 +16,7 @@ public class DomFilterHeuristic implements SummaryHeuristic { public String summarize(Document doc) { doc = doc.clone(); - var filter = new SummaryExtractionFilter(); + var filter = new SummarizingDOMFilter(); doc.filter(filter); diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java similarity index 98% rename from code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java rename to code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java index b20f2b3a..fd71016e 100644 --- a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java +++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary; +package nu.marginalia.summary.heuristic; import com.google.common.base.Strings; import org.apache.commons.lang3.StringUtils; @@ -12,7 +12,7 @@ import java.util.*; import static org.jsoup.internal.StringUtil.isActuallyWhitespace; import static org.jsoup.internal.StringUtil.isInvisibleChar; -public class SummaryExtractionFilter implements NodeFilter { +public class SummarizingDOMFilter implements NodeFilter { public Map statistics = new HashMap<>(10000); public Map pos = new HashMap<>(10000); diff --git a/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java index fafc0747..721e99c4 100644 --- a/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java +++ b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java @@ -29,7 +29,7 @@ class SummaryExtractorTest { public void testSummaryFilter() throws IOException { String html = readClassPathFile("html/monadnock.html"); var doc = Jsoup.parse(html); - var filter = new SummaryExtractionFilter(); + var filter = new SummarizingDOMFilter(); doc.filter(filter); filter.statistics.entrySet().stream().sorted(Comparator.comparing(e -> -e.getValue().textLength()))