(converter) Fix rare sentence extractor bug
It was caused by non-thread safe concurrent memory access in SentenceExtractor.
This commit is contained in:
parent
8ca20f184d
commit
a433bbbe45
@ -22,6 +22,11 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.*;
|
||||
|
||||
/** This class still isn't thread safe!! If you use it concurrently, it won't throw exceptions,
|
||||
* it will just haunt your code and cause unpredictable mysterious errors.
|
||||
*
|
||||
* Use {@link ThreadLocalSentenceExtractorProvider} instead to avoid falling into the twilight zone!
|
||||
*/
|
||||
public class SentenceExtractor {
|
||||
|
||||
private SentenceDetectorME sentenceDetector;
|
||||
|
@ -0,0 +1,19 @@
|
||||
package nu.marginalia.language.sentence;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
|
||||
@Singleton
|
||||
public class ThreadLocalSentenceExtractorProvider {
|
||||
private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;
|
||||
|
||||
@Inject
|
||||
public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
|
||||
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
|
||||
}
|
||||
|
||||
public SentenceExtractor get() {
|
||||
return sentenceExtractorThreadLocal.get();
|
||||
}
|
||||
}
|
@ -10,6 +10,7 @@ import nu.marginalia.converting.processor.logic.links.FileLinks;
|
||||
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||
import nu.marginalia.converting.processor.plugin.specialization.*;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
@ -44,7 +45,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final double minDocumentQuality;
|
||||
|
||||
private final SentenceExtractor sentenceExtractor;
|
||||
private final FeatureExtractor featureExtractor;
|
||||
private final TitleExtractor titleExtractor;
|
||||
private final DocumentKeywordExtractor keywordExtractor;
|
||||
@ -59,6 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
||||
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
||||
|
||||
@Inject
|
||||
@ -73,13 +74,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
DocumentLengthLogic documentLengthLogic,
|
||||
MetaRobotsTag metaRobotsTag,
|
||||
DocumentGeneratorExtractor documentGeneratorExtractor,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
HtmlProcessorSpecializations specializations)
|
||||
{
|
||||
super(languageFilter);
|
||||
|
||||
this.documentLengthLogic = documentLengthLogic;
|
||||
this.minDocumentQuality = minDocumentQuality;
|
||||
this.sentenceExtractor = sentenceExtractor;
|
||||
this.featureExtractor = featureExtractor;
|
||||
|
||||
this.titleExtractor = titleExtractor;
|
||||
@ -88,6 +89,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
this.metaRobotsTag = metaRobotsTag;
|
||||
|
||||
this.documentGeneratorExtractor = documentGeneratorExtractor;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.htmlProcessorSpecializations = specializations;
|
||||
}
|
||||
|
||||
@ -122,7 +124,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||
}
|
||||
|
||||
DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
|
||||
DocumentLanguageData dld =
|
||||
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
|
||||
|
@ -6,7 +6,7 @@ import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
|
||||
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
|
||||
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -17,19 +17,19 @@ import java.util.Collection;
|
||||
public class SideloadSourceFactory {
|
||||
private final Gson gson;
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
private final SentenceExtractor sentenceExtractor;
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentKeywordExtractor documentKeywordExtractor;
|
||||
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
|
||||
|
||||
@Inject
|
||||
public SideloadSourceFactory(Gson gson,
|
||||
SideloaderProcessing sideloaderProcessing,
|
||||
SentenceExtractor sentenceExtractor,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
DocumentKeywordExtractor documentKeywordExtractor,
|
||||
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
|
||||
this.gson = gson;
|
||||
this.sideloaderProcessing = sideloaderProcessing;
|
||||
this.sentenceExtractor = sentenceExtractor;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.documentKeywordExtractor = documentKeywordExtractor;
|
||||
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
|
||||
}
|
||||
@ -48,7 +48,7 @@ public class SideloadSourceFactory {
|
||||
return dirs
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(f -> f.toFile().getName().endsWith(".db"))
|
||||
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor))
|
||||
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
|
||||
.toList();
|
||||
}
|
||||
}
|
||||
|
@ -8,7 +8,6 @@ import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.time.LocalDateTime;
|
||||
@ -23,7 +22,10 @@ public class SideloaderProcessing {
|
||||
this.htmlProcessorPlugin = htmlProcessorPlugin;
|
||||
}
|
||||
|
||||
public ProcessedDocument processDocument(String url, String body, List<String> extraKeywords, int size) throws URISyntaxException {
|
||||
public ProcessedDocument processDocument(String url,
|
||||
String body,
|
||||
List<String> extraKeywords,
|
||||
int size) throws URISyntaxException {
|
||||
var crawledDoc = new CrawledDocument(
|
||||
"encyclopedia.marginalia.nu",
|
||||
url,
|
||||
|
@ -5,7 +5,7 @@ import nu.marginalia.converting.model.*;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
@ -29,7 +29,7 @@ import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class StackexchangeSideloader implements SideloadSource {
|
||||
private final SentenceExtractor sentenceExtractor;
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentKeywordExtractor keywordExtractor;
|
||||
private final String domainName;
|
||||
|
||||
@ -37,12 +37,12 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
|
||||
@SneakyThrows
|
||||
public StackexchangeSideloader(Path pathToDbFile,
|
||||
SentenceExtractor sentenceExtractor,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
DocumentKeywordExtractor keywordExtractor
|
||||
) {
|
||||
this.dbFile = pathToDbFile;
|
||||
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
|
||||
this.sentenceExtractor = sentenceExtractor;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
}
|
||||
|
||||
@ -109,7 +109,7 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
|
||||
var url = new EdgeUrl(fullUrl);
|
||||
var doc = Jsoup.parse(fullHtml.toString());
|
||||
var dld = sentenceExtractor.extractSentences(doc);
|
||||
var dld = sentenceExtractorProvider.get().extractSentences(doc);
|
||||
|
||||
ret.url = url;
|
||||
ret.words = keywordExtractor.extractKeywords(dld, url);
|
||||
|
Loading…
Reference in New Issue
Block a user