Misc tweaks and cleanups
This commit is contained in:
parent
1dac4e7e67
commit
9320a457a5
@ -42,7 +42,7 @@ public abstract class ParallelPipe<INPUT,INTERMEDIATE> {
|
||||
@SneakyThrows
|
||||
private void runProcessThread() {
|
||||
while (expectingInput || !inputs.isEmpty()) {
|
||||
var in = inputs.poll(1, TimeUnit.SECONDS);
|
||||
var in = inputs.poll(10, TimeUnit.SECONDS);
|
||||
|
||||
if (in != null) {
|
||||
try {
|
||||
|
@ -18,7 +18,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
|
||||
{
|
||||
|
||||
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
|
||||
private static final int TF_IDF_HIGH_LIMIT = 64;
|
||||
|
||||
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
|
||||
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
|
||||
|
@ -46,17 +46,12 @@ public class ConverterMain {
|
||||
InstructionsCompiler compiler,
|
||||
Gson gson
|
||||
) throws Exception {
|
||||
|
||||
;
|
||||
|
||||
|
||||
|
||||
logger.info("Starting pipe");
|
||||
|
||||
try (WorkLog processLog = plan.createProcessWorkLog();
|
||||
ConversionLog log = new ConversionLog(plan.process.getDir())) {
|
||||
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
|
||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
|
||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {
|
||||
|
||||
@Override
|
||||
protected ProcessingInstructions onProcess(CrawledDomain domainData) {
|
||||
|
@ -11,7 +11,7 @@ public class CommonKeywordExtractor {
|
||||
|
||||
private static final int MIN_REQUIRED_DOCUMENTS = 25;
|
||||
|
||||
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
|
||||
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 15;
|
||||
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
|
||||
|
||||
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
|
||||
|
@ -139,6 +139,9 @@ public class IndexResultValuator {
|
||||
| EdgePageWordFlags.Subjects.asBit()
|
||||
| EdgePageWordFlags.Synthetic.asBit();
|
||||
|
||||
int termCount = 0;
|
||||
double tfIdfSum = 1.;
|
||||
|
||||
for (String term : termList) {
|
||||
var meta = termMetadata.getTermMetadata(termToId.get(term), urlId);
|
||||
long positions;
|
||||
@ -156,18 +159,22 @@ public class IndexResultValuator {
|
||||
maskDirectGenerous &= positions;
|
||||
}
|
||||
|
||||
termCount++;
|
||||
tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta);
|
||||
}
|
||||
|
||||
double avgTfIdf = termCount / tfIdfSum;
|
||||
|
||||
if (maskAdjacent == 0) {
|
||||
return 40;
|
||||
return Math.max(-2, 40 - 0.5 * avgTfIdf);
|
||||
}
|
||||
|
||||
if (maskDirectGenerous == 0) {
|
||||
return 20;
|
||||
return Math.max(-1, 20 - 0.3 * avgTfIdf);
|
||||
}
|
||||
|
||||
if (maskDirectRaw == 0) {
|
||||
return 2;
|
||||
return Math.max(-1, 15 - 0.2 * avgTfIdf);
|
||||
}
|
||||
|
||||
return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous);
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.integration.arxiv;
|
||||
import nu.marginalia.util.TestLanguageModels;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata;
|
||||
|
Loading…
Reference in New Issue
Block a user