(language-processing) Add maximum length limit for text input in SentenceExtractor

Added a new constant, MAX_TEXT_LENGTH, to the SentenceExtractor class. If the length of the text input exceeds this limit, the text is truncated to fit within the limit. This modification is designed to prevent excessive resource usage for unusually long text inputs.
This commit is contained in:
Viktor Lofgren 2024-01-03 13:59:05 +01:00
parent 32436d099c
commit 0806aa6dfe

View File

@@ -96,7 +96,7 @@ public class SentenceExtractor {
title = doc.getElementsByTag("h2").text();
}
if (title.trim().length() < 3 && textSentences.length > 0) {
if (title.trim().length() < 3) {
for (DocumentSentence textSentence : textSentences) {
if (textSentence.length() > 0) {
title = textSentence.originalSentence.toLowerCase();
@@ -138,10 +138,6 @@ public class SentenceExtractor {
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
if (text.length() > MAX_TEXT_LENGTH) {
textNormalizedSpaces = textNormalizedSpaces.substring(0, MAX_TEXT_LENGTH);
}
try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
@@ -221,7 +217,12 @@ public class SentenceExtractor {
public String asText(Document dc) {
String text = dc.getElementsByTag("body").text();
return text.substring(0, (int) (text.length()*0.95));
if (text.length() > MAX_TEXT_LENGTH) {
return text.substring(0, MAX_TEXT_LENGTH);
}
else {
return text.substring(0, (int) (text.length() * 0.95));
}
}