(converter) Fix rare sentence extractor bug

It was caused by non-thread safe concurrent memory access in SentenceExtractor.
This commit is contained in:
Viktor Lofgren 2023-09-24 19:39:27 +02:00
parent 8ca20f184d
commit a433bbbe45
6 changed files with 44 additions and 15 deletions

View File

@ -22,6 +22,11 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.*; import java.util.*;
/** This class still isn't thread safe!! If you use it concurrently, it won't throw exceptions,
* it will just haunt your code and cause unpredictable mysterious errors.
*
* Use {@link ThreadLocalSentenceExtractorProvider} instead to avoid falling into the twilight zone!
*/
public class SentenceExtractor { public class SentenceExtractor {
private SentenceDetectorME sentenceDetector; private SentenceDetectorME sentenceDetector;

View File

@ -0,0 +1,19 @@
package nu.marginalia.language.sentence;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
@Singleton
public class ThreadLocalSentenceExtractorProvider {
private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;
@Inject
public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
}
public SentenceExtractor get() {
return sentenceExtractorThreadLocal.get();
}
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.converting.processor.logic.links.FileLinks;
import nu.marginalia.converting.processor.logic.links.LinkProcessor; import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.converting.processor.plugin.specialization.*; import nu.marginalia.converting.processor.plugin.specialization.*;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.link_parser.LinkParser; import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
@ -44,7 +45,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final double minDocumentQuality; private final double minDocumentQuality;
private final SentenceExtractor sentenceExtractor;
private final FeatureExtractor featureExtractor; private final FeatureExtractor featureExtractor;
private final TitleExtractor titleExtractor; private final TitleExtractor titleExtractor;
private final DocumentKeywordExtractor keywordExtractor; private final DocumentKeywordExtractor keywordExtractor;
@ -59,6 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final LinkParser linkParser = new LinkParser(); private static final LinkParser linkParser = new LinkParser();
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser); private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final HtmlProcessorSpecializations htmlProcessorSpecializations; private final HtmlProcessorSpecializations htmlProcessorSpecializations;
@Inject @Inject
@ -73,13 +74,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
DocumentLengthLogic documentLengthLogic, DocumentLengthLogic documentLengthLogic,
MetaRobotsTag metaRobotsTag, MetaRobotsTag metaRobotsTag,
DocumentGeneratorExtractor documentGeneratorExtractor, DocumentGeneratorExtractor documentGeneratorExtractor,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
HtmlProcessorSpecializations specializations) HtmlProcessorSpecializations specializations)
{ {
super(languageFilter); super(languageFilter);
this.documentLengthLogic = documentLengthLogic; this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality; this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor;
this.featureExtractor = featureExtractor; this.featureExtractor = featureExtractor;
this.titleExtractor = titleExtractor; this.titleExtractor = titleExtractor;
@ -88,6 +89,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
this.metaRobotsTag = metaRobotsTag; this.metaRobotsTag = metaRobotsTag;
this.documentGeneratorExtractor = documentGeneratorExtractor; this.documentGeneratorExtractor = documentGeneratorExtractor;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.htmlProcessorSpecializations = specializations; this.htmlProcessorSpecializations = specializations;
} }
@ -122,7 +124,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
} }
DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc)); DocumentLanguageData dld =
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));
checkDocumentLanguage(dld); checkDocumentLanguage(dld);

View File

@ -6,7 +6,7 @@ import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader; import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -17,19 +17,19 @@ import java.util.Collection;
public class SideloadSourceFactory { public class SideloadSourceFactory {
private final Gson gson; private final Gson gson;
private final SideloaderProcessing sideloaderProcessing; private final SideloaderProcessing sideloaderProcessing;
private final SentenceExtractor sentenceExtractor; private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor documentKeywordExtractor; private final DocumentKeywordExtractor documentKeywordExtractor;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
@Inject @Inject
public SideloadSourceFactory(Gson gson, public SideloadSourceFactory(Gson gson,
SideloaderProcessing sideloaderProcessing, SideloaderProcessing sideloaderProcessing,
SentenceExtractor sentenceExtractor, ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor documentKeywordExtractor, DocumentKeywordExtractor documentKeywordExtractor,
DirtreeSideloaderFactory dirtreeSideloaderFactory) { DirtreeSideloaderFactory dirtreeSideloaderFactory) {
this.gson = gson; this.gson = gson;
this.sideloaderProcessing = sideloaderProcessing; this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractor = sentenceExtractor; this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentKeywordExtractor = documentKeywordExtractor; this.documentKeywordExtractor = documentKeywordExtractor;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
} }
@ -48,7 +48,7 @@ public class SideloadSourceFactory {
return dirs return dirs
.filter(Files::isRegularFile) .filter(Files::isRegularFile)
.filter(f -> f.toFile().getName().endsWith(".db")) .filter(f -> f.toFile().getName().endsWith(".db"))
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor)) .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
.toList(); .toList();
} }
} }

View File

@ -8,7 +8,6 @@ import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.time.LocalDateTime; import java.time.LocalDateTime;
@ -23,7 +22,10 @@ public class SideloaderProcessing {
this.htmlProcessorPlugin = htmlProcessorPlugin; this.htmlProcessorPlugin = htmlProcessorPlugin;
} }
public ProcessedDocument processDocument(String url, String body, List<String> extraKeywords, int size) throws URISyntaxException { public ProcessedDocument processDocument(String url,
String body,
List<String> extraKeywords,
int size) throws URISyntaxException {
var crawledDoc = new CrawledDocument( var crawledDoc = new CrawledDocument(
"encyclopedia.marginalia.nu", "encyclopedia.marginalia.nu",
url, url,

View File

@ -5,7 +5,7 @@ import nu.marginalia.converting.model.*;
import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb; import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
@ -29,7 +29,7 @@ import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
public class StackexchangeSideloader implements SideloadSource { public class StackexchangeSideloader implements SideloadSource {
private final SentenceExtractor sentenceExtractor; private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor keywordExtractor; private final DocumentKeywordExtractor keywordExtractor;
private final String domainName; private final String domainName;
@ -37,12 +37,12 @@ public class StackexchangeSideloader implements SideloadSource {
@SneakyThrows @SneakyThrows
public StackexchangeSideloader(Path pathToDbFile, public StackexchangeSideloader(Path pathToDbFile,
SentenceExtractor sentenceExtractor, ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor DocumentKeywordExtractor keywordExtractor
) { ) {
this.dbFile = pathToDbFile; this.dbFile = pathToDbFile;
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile); this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
this.sentenceExtractor = sentenceExtractor; this.sentenceExtractorProvider = sentenceExtractorProvider;
this.keywordExtractor = keywordExtractor; this.keywordExtractor = keywordExtractor;
} }
@ -109,7 +109,7 @@ public class StackexchangeSideloader implements SideloadSource {
var url = new EdgeUrl(fullUrl); var url = new EdgeUrl(fullUrl);
var doc = Jsoup.parse(fullHtml.toString()); var doc = Jsoup.parse(fullHtml.toString());
var dld = sentenceExtractor.extractSentences(doc); var dld = sentenceExtractorProvider.get().extractSentences(doc);
ret.url = url; ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, url); ret.words = keywordExtractor.extractKeywords(dld, url);