Misc tweaks and cleanups

This commit is contained in:
Viktor Lofgren 2023-01-30 09:44:09 +01:00
parent 1dac4e7e67
commit 9320a457a5
6 changed files with 14 additions and 13 deletions

View File

@ -42,7 +42,7 @@ public abstract class ParallelPipe<INPUT,INTERMEDIATE> {
@SneakyThrows
private void runProcessThread() {
while (expectingInput || !inputs.isEmpty()) {
var in = inputs.poll(1, TimeUnit.SECONDS);
var in = inputs.poll(10, TimeUnit.SECONDS);
if (in != null) {
try {

View File

@ -18,7 +18,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
{
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
private static final int TF_IDF_HIGH_LIMIT = 64;
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),

View File

@ -46,17 +46,12 @@ public class ConverterMain {
InstructionsCompiler compiler,
Gson gson
) throws Exception {
;
logger.info("Starting pipe");
try (WorkLog processLog = plan.createProcessWorkLog();
ConversionLog log = new ConversionLog(plan.process.getDir())) {
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {
@Override
protected ProcessingInstructions onProcess(CrawledDomain domainData) {

View File

@ -11,7 +11,7 @@ public class CommonKeywordExtractor {
private static final int MIN_REQUIRED_DOCUMENTS = 25;
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 15;
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;

View File

@ -139,6 +139,9 @@ public class IndexResultValuator {
| EdgePageWordFlags.Subjects.asBit()
| EdgePageWordFlags.Synthetic.asBit();
int termCount = 0;
double tfIdfSum = 1.;
for (String term : termList) {
var meta = termMetadata.getTermMetadata(termToId.get(term), urlId);
long positions;
@ -156,18 +159,22 @@ public class IndexResultValuator {
maskDirectGenerous &= positions;
}
termCount++;
tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta);
}
double avgTfIdf = termCount / tfIdfSum;
if (maskAdjacent == 0) {
return 40;
return Math.max(-2, 40 - 0.5 * avgTfIdf);
}
if (maskDirectGenerous == 0) {
return 20;
return Math.max(-1, 20 - 0.3 * avgTfIdf);
}
if (maskDirectRaw == 0) {
return 2;
return Math.max(-1, 15 - 0.2 * avgTfIdf);
}
return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous);

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.integration.arxiv;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata;