Allocation-free text utility
This commit is contained in:
parent
77f2ca51af
commit
2979f4703e
@ -0,0 +1,87 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
public class HeuristicTextUtil {

    /**
     * Counts how many of the given words occur at least once in the text.
     *
     * <p>Each word contributes at most one to the result, no matter how often it
     * appears in the text. The words must be all lower case, the text may be in
     * any case. To count as a match, an occurrence must not be directly preceded
     * or followed by a letter or a digit.
     *
     * <p>Text in which no character has a distinct lower-case form is searched
     * case-sensitively; any other text is searched ignoring case.
     *
     * @param text    the text to search; {@code null} yields 0
     * @param wordsLc the lower-case words to look for
     * @return the number of entries of {@code wordsLc} that occur in {@code text}
     */
    public static int countOccurrencesOfAnyWord(String text, Collection<String> wordsLc) {
        if (text == null) {
            return 0;
        }
        if (isAlreadyLowerCase(text)) {
            return countOccurrencesOfAnyWordLowerCase(text, wordsLc);
        }

        int cnt = 0;
        for (var word : wordsLc) {
            if (containsWord(text, word, true)) {
                cnt++;
            }
        }
        return cnt;
    }

    /**
     * Tells whether the word occurs in the text as a stand-alone word, ignoring case.
     *
     * @param text          the text to search, in any case; {@code null} never matches
     * @param wordLowerCase the word to look for
     * @return {@code true} if some occurrence has neither a letter nor a digit
     *         directly before or after it
     */
    public static boolean containsWordInAnyCase(String text, String wordLowerCase) {
        if (text == null) {
            return false;
        }
        return containsWord(text, wordLowerCase, true);
    }

    /**
     * Counts how many of the given words occur at least once in lower-case text.
     *
     * <p>The comparison is case-sensitive, so both the text and the words are
     * expected to be lower case already. Each word contributes at most one to
     * the result.
     *
     * @param textLc  the lower-case text to search
     * @param wordsLc the lower-case words to look for
     * @return the number of entries of {@code wordsLc} that occur in {@code textLc}
     */
    public static int countOccurrencesOfAnyWordLowerCase(String textLc, Collection<String> wordsLc) {
        int cnt = 0;
        for (var word : wordsLc) {
            if (containsWord(textLc, word, false)) {
                cnt++;
            }
        }
        return cnt;
    }

    /**
     * Tells whether the word occurs in the text as a stand-alone word, comparing
     * case-sensitively.
     *
     * @param text          the text to search, expected to be lower case
     * @param wordLowerCase the word to look for
     * @return {@code true} if some occurrence has neither a letter nor a digit
     *         directly before or after it
     */
    public static boolean containsWordAllLowerCase(String text, String wordLowerCase) {
        return containsWord(text, wordLowerCase, false);
    }

    /** Returns true if no character of the text would be changed by lower-casing. */
    private static boolean isAlreadyLowerCase(String text) {
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (Character.toLowerCase(c) != c) {
                return false;
            }
        }
        return true;
    }

    /** Scans every occurrence of the word and accepts the first one with clean boundaries on both sides. */
    private static boolean containsWord(String text, String word, boolean ignoreCase) {
        int wordLength = word.length();

        int pos = indexOf(text, word, 0, ignoreCase);
        while (pos >= 0) {
            if (!isWordChar(text, pos - 1) && !isWordChar(text, pos + wordLength)) {
                return true;
            }
            pos = indexOf(text, word, pos + 1, ignoreCase);
        }
        return false;
    }

    /** Returns true if the index is inside the text and holds a letter or a digit. */
    private static boolean isWordChar(String text, int index) {
        if (index < 0 || index >= text.length()) {
            // The start and the end of the text count as word boundaries.
            return false;
        }
        char c = text.charAt(index);
        return Character.isAlphabetic(c) || Character.isDigit(c);
    }

    /** Finds the next occurrence of the word at or after fromIndex without allocating, or -1. */
    private static int indexOf(String text, String word, int fromIndex, boolean ignoreCase) {
        int lastStart = text.length() - word.length();
        if (fromIndex > lastStart) {
            // Without this guard String.indexOf keeps reporting text.length() for an
            // empty word, and the caller's scan would never terminate.
            return -1;
        }
        if (!ignoreCase) {
            return text.indexOf(word, fromIndex);
        }
        for (int i = fromIndex; i <= lastStart; i++) {
            if (text.regionMatches(true, i, word, 0, word.length())) {
                return i;
            }
        }
        return -1;
    }

}
|
@ -0,0 +1,39 @@
|
||||
package nu.marginalia.summary.heuristic;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class HeuristicTextUtilTest {
|
||||
|
||||
@Test
|
||||
void countOccurrencesOfAnyWord() {
|
||||
String sentence = "B A Baracus was an expert with the Abacus";
|
||||
assertEquals(4, HeuristicTextUtil.countOccurrencesOfAnyWord(sentence, Set.of("b", "a", "baracus", "abacus")));
|
||||
}
|
||||
|
||||
@Test
|
||||
void containsWordInAnyCase() {
|
||||
String sentence = "B A Baracus was an expert with the Abacus";
|
||||
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "b"));
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "a"));
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "baracus"));
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "abacus"));
|
||||
assertFalse(HeuristicTextUtil.containsWordInAnyCase(sentence, "cus"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void containsWordAllLowerCase() {
|
||||
String sentence = "b a baracus was an expert with the abacus";
|
||||
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "b"));
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "a"));
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "baracus"));
|
||||
assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "abacus"));
|
||||
assertFalse(HeuristicTextUtil.containsWordInAnyCase(sentence, "cus"));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user