Allocation-free text utility

This commit is contained in:
Viktor Lofgren 2023-06-12 17:42:31 +02:00 committed by Viktor
parent 77f2ca51af
commit 2979f4703e
2 changed files with 126 additions and 0 deletions

View File

@ -0,0 +1,87 @@
package nu.marginalia.summary.heuristic;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection;
public class HeuristicTextUtil {
/** Return the number of occurrences of any word in the set of words in the text.
*
* The words must be all lower case, the text may be in any case. To count as a match,
* the word must be surrounded by non-alphabetic characters.
*
*/
public static int countOccurrencesOfAnyWord(String text, Collection<String> wordsLc) {
if (StringUtils.isAllLowerCase(text)) {
return countOccurrencesOfAnyWordLowerCase(text, wordsLc);
}
int cnt = 0;
for (var word : wordsLc) {
if (containsWordInAnyCase(text, word)) {
cnt++;
}
}
return cnt;
}
public static boolean containsWordInAnyCase(String text, String wordLowerCase) {
int pos = StringUtils.indexOfIgnoreCase(text, wordLowerCase);
int wl = wordLowerCase.length();
while (pos >= 0) {
if (pos > 0) {
char c = text.charAt(pos - 1);
if (Character.isAlphabetic(c) || Character.isDigit(c)) {
pos = StringUtils.indexOfIgnoreCase(text, wordLowerCase, pos + 1);
continue;
}
}
if (pos + wl < text.length()) {
char c = text.charAt(pos + wl);
if (Character.isAlphabetic(c) || Character.isDigit(c)) {
pos = StringUtils.indexOfIgnoreCase(text, wordLowerCase, pos + 1);
continue;
}
}
return true;
}
return false;
}
public static int countOccurrencesOfAnyWordLowerCase(String textLc, Collection<String> wordsLc) {
int cnt = 0;
for (var word : wordsLc) {
if (containsWordAllLowerCase(textLc, word)) {
cnt++;
}
}
return cnt;
}
public static boolean containsWordAllLowerCase(String text, String wordLowerCase) {
int pos = text.indexOf(wordLowerCase);
int wl = wordLowerCase.length();
while (pos >= 0) {
if (pos > 0) {
char c = text.charAt(pos - 1);
if (Character.isAlphabetic(c) || Character.isDigit(c)) {
pos = text.indexOf(wordLowerCase, pos + 1);
continue;
}
}
if (pos + wl < text.length()) {
char c = text.charAt(pos + wl);
if (Character.isAlphabetic(c) || Character.isDigit(c)) {
pos = text.indexOf(wordLowerCase, pos + 1);
continue;
}
}
return true;
}
return false;
}
}

View File

@ -0,0 +1,39 @@
package nu.marginalia.summary.heuristic;
import org.junit.jupiter.api.Test;
import java.util.Set;
import static org.junit.jupiter.api.Assertions.*;
class HeuristicTextUtilTest {
@Test
void countOccurrencesOfAnyWord() {
String sentence = "B A Baracus was an expert with the Abacus";
assertEquals(4, HeuristicTextUtil.countOccurrencesOfAnyWord(sentence, Set.of("b", "a", "baracus", "abacus")));
}
@Test
void containsWordInAnyCase() {
String sentence = "B A Baracus was an expert with the Abacus";
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "b"));
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "a"));
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "baracus"));
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "abacus"));
assertFalse(HeuristicTextUtil.containsWordInAnyCase(sentence, "cus"));
}
@Test
void containsWordAllLowerCase() {
String sentence = "b a baracus was an expert with the abacus";
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "b"));
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "a"));
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "baracus"));
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "abacus"));
assertFalse(HeuristicTextUtil.containsWordInAnyCase(sentence, "cus"));
}
}