(converter) Fix rare sentence extractor bug
It was caused by non-thread safe concurrent memory access in SentenceExtractor.
This commit is contained in:
parent
8ca20f184d
commit
a433bbbe45
@ -22,6 +22,11 @@ import java.io.IOException;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
/** This class still isn't thread safe!! If you use it concurrently, it won't throw exceptions,
|
||||||
|
* it will just haunt your code and cause unpredictable mysterious errors.
|
||||||
|
*
|
||||||
|
* Use {@link ThreadLocalSentenceExtractorProvider} instead to avoid falling into the twilight zone!
|
||||||
|
*/
|
||||||
public class SentenceExtractor {
|
public class SentenceExtractor {
|
||||||
|
|
||||||
private SentenceDetectorME sentenceDetector;
|
private SentenceDetectorME sentenceDetector;
|
||||||
|
@ -0,0 +1,19 @@
|
|||||||
|
package nu.marginalia.language.sentence;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.LanguageModels;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class ThreadLocalSentenceExtractorProvider {
|
||||||
|
private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
|
||||||
|
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
|
||||||
|
}
|
||||||
|
|
||||||
|
public SentenceExtractor get() {
|
||||||
|
return sentenceExtractorThreadLocal.get();
|
||||||
|
}
|
||||||
|
}
|
@ -10,6 +10,7 @@ import nu.marginalia.converting.processor.logic.links.FileLinks;
|
|||||||
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||||
import nu.marginalia.converting.processor.plugin.specialization.*;
|
import nu.marginalia.converting.processor.plugin.specialization.*;
|
||||||
import nu.marginalia.language.model.DocumentLanguageData;
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.link_parser.LinkParser;
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
@ -44,7 +45,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final double minDocumentQuality;
|
private final double minDocumentQuality;
|
||||||
|
|
||||||
private final SentenceExtractor sentenceExtractor;
|
|
||||||
private final FeatureExtractor featureExtractor;
|
private final FeatureExtractor featureExtractor;
|
||||||
private final TitleExtractor titleExtractor;
|
private final TitleExtractor titleExtractor;
|
||||||
private final DocumentKeywordExtractor keywordExtractor;
|
private final DocumentKeywordExtractor keywordExtractor;
|
||||||
@ -59,6 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
private static final LinkParser linkParser = new LinkParser();
|
private static final LinkParser linkParser = new LinkParser();
|
||||||
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
||||||
|
|
||||||
|
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||||
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
@ -73,13 +74,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
DocumentLengthLogic documentLengthLogic,
|
DocumentLengthLogic documentLengthLogic,
|
||||||
MetaRobotsTag metaRobotsTag,
|
MetaRobotsTag metaRobotsTag,
|
||||||
DocumentGeneratorExtractor documentGeneratorExtractor,
|
DocumentGeneratorExtractor documentGeneratorExtractor,
|
||||||
|
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||||
HtmlProcessorSpecializations specializations)
|
HtmlProcessorSpecializations specializations)
|
||||||
{
|
{
|
||||||
super(languageFilter);
|
super(languageFilter);
|
||||||
|
|
||||||
this.documentLengthLogic = documentLengthLogic;
|
this.documentLengthLogic = documentLengthLogic;
|
||||||
this.minDocumentQuality = minDocumentQuality;
|
this.minDocumentQuality = minDocumentQuality;
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
|
||||||
this.featureExtractor = featureExtractor;
|
this.featureExtractor = featureExtractor;
|
||||||
|
|
||||||
this.titleExtractor = titleExtractor;
|
this.titleExtractor = titleExtractor;
|
||||||
@ -88,6 +89,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
this.metaRobotsTag = metaRobotsTag;
|
this.metaRobotsTag = metaRobotsTag;
|
||||||
|
|
||||||
this.documentGeneratorExtractor = documentGeneratorExtractor;
|
this.documentGeneratorExtractor = documentGeneratorExtractor;
|
||||||
|
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||||
this.htmlProcessorSpecializations = specializations;
|
this.htmlProcessorSpecializations = specializations;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,7 +124,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||||
}
|
}
|
||||||
|
|
||||||
DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
|
DocumentLanguageData dld =
|
||||||
|
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));
|
||||||
|
|
||||||
checkDocumentLanguage(dld);
|
checkDocumentLanguage(dld);
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
|
|||||||
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
|
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
|
||||||
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
|
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
|
||||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
@ -17,19 +17,19 @@ import java.util.Collection;
|
|||||||
public class SideloadSourceFactory {
|
public class SideloadSourceFactory {
|
||||||
private final Gson gson;
|
private final Gson gson;
|
||||||
private final SideloaderProcessing sideloaderProcessing;
|
private final SideloaderProcessing sideloaderProcessing;
|
||||||
private final SentenceExtractor sentenceExtractor;
|
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||||
private final DocumentKeywordExtractor documentKeywordExtractor;
|
private final DocumentKeywordExtractor documentKeywordExtractor;
|
||||||
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
|
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SideloadSourceFactory(Gson gson,
|
public SideloadSourceFactory(Gson gson,
|
||||||
SideloaderProcessing sideloaderProcessing,
|
SideloaderProcessing sideloaderProcessing,
|
||||||
SentenceExtractor sentenceExtractor,
|
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||||
DocumentKeywordExtractor documentKeywordExtractor,
|
DocumentKeywordExtractor documentKeywordExtractor,
|
||||||
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
|
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
|
||||||
this.gson = gson;
|
this.gson = gson;
|
||||||
this.sideloaderProcessing = sideloaderProcessing;
|
this.sideloaderProcessing = sideloaderProcessing;
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||||
this.documentKeywordExtractor = documentKeywordExtractor;
|
this.documentKeywordExtractor = documentKeywordExtractor;
|
||||||
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
|
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
|
||||||
}
|
}
|
||||||
@ -48,7 +48,7 @@ public class SideloadSourceFactory {
|
|||||||
return dirs
|
return dirs
|
||||||
.filter(Files::isRegularFile)
|
.filter(Files::isRegularFile)
|
||||||
.filter(f -> f.toFile().getName().endsWith(".db"))
|
.filter(f -> f.toFile().getName().endsWith(".db"))
|
||||||
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor))
|
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
|
||||||
.toList();
|
.toList();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,6 @@ import nu.marginalia.crawling.model.CrawledDocument;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
@ -23,7 +22,10 @@ public class SideloaderProcessing {
|
|||||||
this.htmlProcessorPlugin = htmlProcessorPlugin;
|
this.htmlProcessorPlugin = htmlProcessorPlugin;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ProcessedDocument processDocument(String url, String body, List<String> extraKeywords, int size) throws URISyntaxException {
|
public ProcessedDocument processDocument(String url,
|
||||||
|
String body,
|
||||||
|
List<String> extraKeywords,
|
||||||
|
int size) throws URISyntaxException {
|
||||||
var crawledDoc = new CrawledDocument(
|
var crawledDoc = new CrawledDocument(
|
||||||
"encyclopedia.marginalia.nu",
|
"encyclopedia.marginalia.nu",
|
||||||
url,
|
url,
|
||||||
|
@ -5,7 +5,7 @@ import nu.marginalia.converting.model.*;
|
|||||||
import nu.marginalia.converting.sideload.SideloadSource;
|
import nu.marginalia.converting.sideload.SideloadSource;
|
||||||
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
||||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
@ -29,7 +29,7 @@ import java.util.concurrent.ArrayBlockingQueue;
|
|||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
public class StackexchangeSideloader implements SideloadSource {
|
public class StackexchangeSideloader implements SideloadSource {
|
||||||
private final SentenceExtractor sentenceExtractor;
|
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||||
private final DocumentKeywordExtractor keywordExtractor;
|
private final DocumentKeywordExtractor keywordExtractor;
|
||||||
private final String domainName;
|
private final String domainName;
|
||||||
|
|
||||||
@ -37,12 +37,12 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public StackexchangeSideloader(Path pathToDbFile,
|
public StackexchangeSideloader(Path pathToDbFile,
|
||||||
SentenceExtractor sentenceExtractor,
|
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||||
DocumentKeywordExtractor keywordExtractor
|
DocumentKeywordExtractor keywordExtractor
|
||||||
) {
|
) {
|
||||||
this.dbFile = pathToDbFile;
|
this.dbFile = pathToDbFile;
|
||||||
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
|
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||||
this.keywordExtractor = keywordExtractor;
|
this.keywordExtractor = keywordExtractor;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,7 +109,7 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
|
|
||||||
var url = new EdgeUrl(fullUrl);
|
var url = new EdgeUrl(fullUrl);
|
||||||
var doc = Jsoup.parse(fullHtml.toString());
|
var doc = Jsoup.parse(fullHtml.toString());
|
||||||
var dld = sentenceExtractor.extractSentences(doc);
|
var dld = sentenceExtractorProvider.get().extractSentences(doc);
|
||||||
|
|
||||||
ret.url = url;
|
ret.url = url;
|
||||||
ret.words = keywordExtractor.extractKeywords(dld, url);
|
ret.words = keywordExtractor.extractKeywords(dld, url);
|
||||||
|
Loading…
Reference in New Issue
Block a user