From db0216936eddbbcc902e9d089b1cacc5e00caaf7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 16 Aug 2023 15:48:34 +0200 Subject: [PATCH] (summary) Reduce the chance of expensive operations --- .../summary/heuristic/SummarizingDOMFilter.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java index f2137a60..f72b0eae 100644 --- a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java +++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java @@ -101,9 +101,15 @@ public class SummarizingDOMFilter implements NodeFilter { for (var stats : in) { // text() is expensive, we don't mind sifting through superfluous whitespace - int cnt = stats.score(tn -> - countOccurrencesOfAnyWord(tn.getWholeText(), importantWords) - - countOccurrencesOfAnyWord(tn.getWholeText(), badWords)); + int cnt = stats.score(tn -> { + String wholeText = tn.getWholeText(); + + if (wholeText.length() > 128) + return 0; + + return countOccurrencesOfAnyWord(wholeText, importantWords) + - countOccurrencesOfAnyWord(wholeText, badWords); + }); if (cnt > 0) { ret.put(stats, -cnt);