(converter) Fix rare sentence extractor bug

It was caused by non-thread-safe concurrent memory access in SentenceExtractor.
This commit is contained in:
Viktor Lofgren 2023-09-24 19:39:27 +02:00
parent 8ca20f184d
commit a433bbbe45
6 changed files with 44 additions and 15 deletions

View File

@ -22,6 +22,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.*;
/** This class still isn't thread safe!! If you use it concurrently, it won't throw exceptions,
* it will just haunt your code and cause unpredictable mysterious errors.
*
* Use {@link ThreadLocalSentenceExtractorProvider} instead to avoid falling into the twilight zone!
*/
public class SentenceExtractor {
private SentenceDetectorME sentenceDetector;

View File

@ -0,0 +1,19 @@
package nu.marginalia.language.sentence;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
@Singleton
public class ThreadLocalSentenceExtractorProvider {
    /** Holds one SentenceExtractor per thread, since SentenceExtractor itself is not thread safe. */
    private final ThreadLocal<SentenceExtractor> perThreadExtractor;

    @Inject
    public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
        this.perThreadExtractor =
                ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
    }

    /** Returns the SentenceExtractor instance bound to the calling thread. */
    public SentenceExtractor get() {
        return perThreadExtractor.get();
    }
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.converting.processor.logic.links.FileLinks;
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.converting.processor.plugin.specialization.*;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;
@ -44,7 +45,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final Logger logger = LoggerFactory.getLogger(getClass());
private final double minDocumentQuality;
private final SentenceExtractor sentenceExtractor;
private final FeatureExtractor featureExtractor;
private final TitleExtractor titleExtractor;
private final DocumentKeywordExtractor keywordExtractor;
@ -59,6 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final LinkParser linkParser = new LinkParser();
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
@Inject
@ -73,13 +74,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
DocumentLengthLogic documentLengthLogic,
MetaRobotsTag metaRobotsTag,
DocumentGeneratorExtractor documentGeneratorExtractor,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
HtmlProcessorSpecializations specializations)
{
super(languageFilter);
this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor;
this.featureExtractor = featureExtractor;
this.titleExtractor = titleExtractor;
@ -88,6 +89,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
this.metaRobotsTag = metaRobotsTag;
this.documentGeneratorExtractor = documentGeneratorExtractor;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.htmlProcessorSpecializations = specializations;
}
@ -122,7 +124,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
}
DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
DocumentLanguageData dld =
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));
checkDocumentLanguage(dld);

View File

@ -6,7 +6,7 @@ import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import java.io.IOException;
import java.nio.file.Files;
@ -17,19 +17,19 @@ import java.util.Collection;
public class SideloadSourceFactory {
private final Gson gson;
private final SideloaderProcessing sideloaderProcessing;
private final SentenceExtractor sentenceExtractor;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor documentKeywordExtractor;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
@Inject
public SideloadSourceFactory(Gson gson,
SideloaderProcessing sideloaderProcessing,
SentenceExtractor sentenceExtractor,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor documentKeywordExtractor,
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
this.gson = gson;
this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractor = sentenceExtractor;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentKeywordExtractor = documentKeywordExtractor;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
}
@ -48,7 +48,7 @@ public class SideloadSourceFactory {
return dirs
.filter(Files::isRegularFile)
.filter(f -> f.toFile().getName().endsWith(".db"))
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor))
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
.toList();
}
}

View File

@ -8,7 +8,6 @@ import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.net.URISyntaxException;
import java.time.LocalDateTime;
@ -23,7 +22,10 @@ public class SideloaderProcessing {
this.htmlProcessorPlugin = htmlProcessorPlugin;
}
public ProcessedDocument processDocument(String url, String body, List<String> extraKeywords, int size) throws URISyntaxException {
public ProcessedDocument processDocument(String url,
String body,
List<String> extraKeywords,
int size) throws URISyntaxException {
var crawledDoc = new CrawledDocument(
"encyclopedia.marginalia.nu",
url,

View File

@ -5,7 +5,7 @@ import nu.marginalia.converting.model.*;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
@ -29,7 +29,7 @@ import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
public class StackexchangeSideloader implements SideloadSource {
private final SentenceExtractor sentenceExtractor;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor keywordExtractor;
private final String domainName;
@ -37,12 +37,12 @@ public class StackexchangeSideloader implements SideloadSource {
@SneakyThrows
public StackexchangeSideloader(Path pathToDbFile,
SentenceExtractor sentenceExtractor,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor
) {
this.dbFile = pathToDbFile;
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
this.sentenceExtractor = sentenceExtractor;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.keywordExtractor = keywordExtractor;
}
@ -109,7 +109,7 @@ public class StackexchangeSideloader implements SideloadSource {
var url = new EdgeUrl(fullUrl);
var doc = Jsoup.parse(fullHtml.toString());
var dld = sentenceExtractor.extractSentences(doc);
var dld = sentenceExtractorProvider.get().extractSentences(doc);
ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, url);