Merge pull request 'master' (#80) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/80

Commit 10d8678f63
@@ -6,6 +6,9 @@ import lombok.AllArgsConstructor;

 import java.util.Arrays;
 import java.util.stream.Stream;

+/**
+ * @see nu.marginalia.util.language.processing.SentenceExtractor
+ */
 @AllArgsConstructor
 public class DocumentLanguageData {
     public final DocumentSentence[] sentences;
@@ -112,7 +112,7 @@ public class DocumentProcessor {
         return ret;
     }

-    private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
+    public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
         if (crawledDocument.contentType == null) {
             return false;
         }
@@ -7,7 +7,9 @@ public enum HtmlFeature {
     JS("special:scripts"),
     AFFILIATE_LINK("special:affiliate"),
     TRACKING("special:tracking"),
-    COOKIES("special:cookies")
+    COOKIES("special:cookies"),
+
+    CATEGORY_FOOD("category:food"),
     ;

     private final String keyword;

@@ -31,4 +33,8 @@ public enum HtmlFeature {
     public static boolean hasFeature(int value, HtmlFeature feature) {
         return (value & (1 << feature.ordinal())) != 0;
     }
+
+    public int getFeatureBit() {
+        return (1 << ordinal());
+    }
 }
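Each HtmlFeature constant claims one bit of an int-valued feature mask via its ordinal, so a page's features pack into a single integer column and can be tested with a mask. A minimal sketch of the round trip through the two helpers above (the featureMask variable is illustrative, not part of the commit):

    // Illustrative: combine feature bits into a mask, then test membership.
    int featureMask = HtmlFeature.TRACKING.getFeatureBit()
                    | HtmlFeature.CATEGORY_FOOD.getFeatureBit();

    boolean food = HtmlFeature.hasFeature(featureMask, HtmlFeature.CATEGORY_FOOD); // true
    boolean js   = HtmlFeature.hasFeature(featureMask, HtmlFeature.JS);            // false

Since the bit position is the constant's ordinal, the enum's declaration order is effectively part of the stored format; new constants like CATEGORY_FOOD are appended rather than inserted.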
@@ -0,0 +1,234 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;

import java.util.HashMap;
import java.util.Map;

import static java.lang.Math.max;
import static java.lang.Math.sqrt;

public class RecipeDetector {
    private static final int AVG_RECIPE_LENGTH = 250;

    private final Map<String, Double> termValues = new HashMap<>();

    public RecipeDetector() {
        PorterStemmer ps = new PorterStemmer();

        // these terms appear in most recipes
        termValues.put(ps.stemWord("ingredients"), 0.3);
        termValues.put(ps.stemWord("recipe"), 0.1);
        termValues.put(ps.stemWord("preparations"), 0.1);
        termValues.put(ps.stemWord("instructions"), 0.1);

        // penalize restaurant menus
        termValues.put(ps.stemWord("menu"), -0.5);

        // error non habet ius
        termValues.put(ps.stemWord("email"), -0.15);
        termValues.put(ps.stemWord("checkout"), -0.15);
        termValues.put(ps.stemWord("reviews"), -0.15);
        termValues.put(ps.stemWord("newsletter"), -0.15);

        // measures
        termValues.put(ps.stemWord("dl"), 0.05);
        termValues.put(ps.stemWord("l"), 0.05);
        termValues.put(ps.stemWord("g"), 0.05);
        termValues.put(ps.stemWord("ml"), 0.05);
        termValues.put(ps.stemWord("tsp"), 0.05);
        termValues.put(ps.stemWord("teaspoons"), 0.05);
        termValues.put(ps.stemWord("tbsp"), 0.05);
        termValues.put(ps.stemWord("tablespoons"), 0.05);
        termValues.put(ps.stemWord("cups"), 0.05);
        termValues.put(ps.stemWord("quarts"), 0.05);
        termValues.put(ps.stemWord("pints"), 0.05);

        // techniques
        termValues.put(ps.stemWord("grate"), 0.05);
        termValues.put(ps.stemWord("cut"), 0.05);
        termValues.put(ps.stemWord("peel"), 0.05);
        termValues.put(ps.stemWord("chop"), 0.05);
        termValues.put(ps.stemWord("slice"), 0.05);
        termValues.put(ps.stemWord("debone"), 0.05);
        termValues.put(ps.stemWord("julienne"), 0.05);
        termValues.put(ps.stemWord("saute"), 0.05);
        termValues.put(ps.stemWord("fry"), 0.05);
        termValues.put(ps.stemWord("boil"), 0.05);
        termValues.put(ps.stemWord("parboil"), 0.05);
        termValues.put(ps.stemWord("roast"), 0.05);
        termValues.put(ps.stemWord("grill"), 0.05);
        termValues.put(ps.stemWord("sear"), 0.05);
        termValues.put(ps.stemWord("heat"), 0.05);
        termValues.put(ps.stemWord("dice"), 0.05);
        termValues.put(ps.stemWord("bake"), 0.05);
        termValues.put(ps.stemWord("strain"), 0.05);
        termValues.put(ps.stemWord("melt"), 0.05);
        termValues.put(ps.stemWord("garnish"), 0.05);
        termValues.put(ps.stemWord("preheat"), 0.05);
        termValues.put(ps.stemWord("sprinkle"), 0.05);
        termValues.put(ps.stemWord("spritz"), 0.05);

        // utensils
        termValues.put(ps.stemWord("colander"), 0.05);
        termValues.put(ps.stemWord("pot"), 0.05);
        termValues.put(ps.stemWord("pan"), 0.05);
        termValues.put(ps.stemWord("oven"), 0.05);
        termValues.put(ps.stemWord("stove"), 0.05);
        termValues.put(ps.stemWord("skillet"), 0.05);
        termValues.put(ps.stemWord("wok"), 0.05);
        termValues.put(ps.stemWord("knife"), 0.05);
        termValues.put(ps.stemWord("grater"), 0.05);

        // baking
        termValues.put(ps.stemWord("yeast"), 0.025);
        termValues.put(ps.stemWord("sourdough"), 0.025);
        termValues.put(ps.stemWord("flour"), 0.025);
        termValues.put(ps.stemWord("sugar"), 0.025);
        termValues.put(ps.stemWord("rye"), 0.025);
        termValues.put(ps.stemWord("wheat"), 0.025);
        termValues.put(ps.stemWord("dough"), 0.025);
        termValues.put(ps.stemWord("rise"), 0.025);

        // vegetables
        termValues.put(ps.stemWord("lettuce"), 0.025);
        termValues.put(ps.stemWord("onions"), 0.025);
        termValues.put(ps.stemWord("parsnips"), 0.025);
        termValues.put(ps.stemWord("beets"), 0.025);
        termValues.put(ps.stemWord("carrots"), 0.025);
        termValues.put(ps.stemWord("chilies"), 0.025);
        termValues.put(ps.stemWord("peppers"), 0.025);
        termValues.put(ps.stemWord("chives"), 0.025);
        termValues.put(ps.stemWord("tomatoes"), 0.025);
        termValues.put(ps.stemWord("salad"), 0.025);
        termValues.put(ps.stemWord("leeks"), 0.025);
        termValues.put(ps.stemWord("shallots"), 0.025);
        termValues.put(ps.stemWord("avocado"), 0.025);
        termValues.put(ps.stemWord("asparagus"), 0.025);
        termValues.put(ps.stemWord("cucumbers"), 0.025);
        termValues.put(ps.stemWord("eggplants"), 0.025);
        termValues.put(ps.stemWord("broccoli"), 0.025);
        termValues.put(ps.stemWord("kale"), 0.05);

        termValues.put(ps.stemWord("jalapeno"), 0.025);
        termValues.put(ps.stemWord("habanero"), 0.025);

        termValues.put(ps.stemWord("mushrooms"), 0.025);
        termValues.put(ps.stemWord("shiitake"), 0.025);
        termValues.put(ps.stemWord("chanterelles"), 0.025);

        // brotein
        termValues.put(ps.stemWord("meat"), 0.025);
        termValues.put(ps.stemWord("beef"), 0.025);
        termValues.put(ps.stemWord("chicken"), 0.025);
        termValues.put(ps.stemWord("turkey"), 0.025);
        termValues.put(ps.stemWord("cheese"), 0.025);
        termValues.put(ps.stemWord("pork"), 0.025);
        termValues.put(ps.stemWord("tofu"), 0.025);
        termValues.put(ps.stemWord("salmon"), 0.025);
        termValues.put(ps.stemWord("cod"), 0.025);
        termValues.put(ps.stemWord("veal"), 0.025);
        termValues.put(ps.stemWord("eggs"), 0.025);
        termValues.put(ps.stemWord("lentils"), 0.025);
        termValues.put(ps.stemWord("chickpeas"), 0.025);

        // carbs
        termValues.put(ps.stemWord("rice"), 0.025);
        termValues.put(ps.stemWord("noodles"), 0.025);
        termValues.put(ps.stemWord("beans"), 0.025);
        termValues.put(ps.stemWord("ramen"), 0.025);

        // japan
        termValues.put(ps.stemWord("miso"), 0.025);
        termValues.put(ps.stemWord("natto"), 0.025);
        termValues.put(ps.stemWord("udon"), 0.025);
        termValues.put(ps.stemWord("soba"), 0.025);
        termValues.put(ps.stemWord("shichimi"), 0.025);
        termValues.put(ps.stemWord("nori"), 0.025);

        // korea
        termValues.put(ps.stemWord("kimchi"), 0.025);

        // fat of the land
        termValues.put(ps.stemWord("salt"), 0.025);
        termValues.put(ps.stemWord("oil"), 0.025);
        termValues.put(ps.stemWord("olive"), 0.025);
        termValues.put(ps.stemWord("feta"), 0.025);
        termValues.put(ps.stemWord("parmesan"), 0.025);
        termValues.put(ps.stemWord("mozzarella"), 0.025);
        termValues.put(ps.stemWord("gouda"), 0.025);
        termValues.put(ps.stemWord("cheese"), 0.025);
        termValues.put(ps.stemWord("mayonnaise"), 0.025);
        termValues.put(ps.stemWord("butter"), 0.025);

        // spices and sauces
        termValues.put(ps.stemWord("pepper"), 0.025);
        termValues.put(ps.stemWord("garlic"), 0.025);
        termValues.put(ps.stemWord("sriracha"), 0.025);
        termValues.put(ps.stemWord("sambal"), 0.025);
        termValues.put(ps.stemWord("soy"), 0.025);
        termValues.put(ps.stemWord("cumin"), 0.025);
        termValues.put(ps.stemWord("thyme"), 0.025);
        termValues.put(ps.stemWord("basil"), 0.025);
        termValues.put(ps.stemWord("oregano"), 0.025);
        termValues.put(ps.stemWord("cilantro"), 0.025);
        termValues.put(ps.stemWord("ginger"), 0.025);
        termValues.put(ps.stemWord("curry"), 0.025);

        termValues.put(ps.stemWord("water"), 0.025);

        // dessert
        termValues.put(ps.stemWord("lemons"), 0.025);
        termValues.put(ps.stemWord("melons"), 0.025);
        termValues.put(ps.stemWord("cherries"), 0.025);
        termValues.put(ps.stemWord("apples"), 0.025);
        termValues.put(ps.stemWord("pears"), 0.025);

        termValues.put(ps.stemWord("chocolate"), 0.025);
        termValues.put(ps.stemWord("vanilla"), 0.025);

        // dairy
        termValues.put(ps.stemWord("milk"), 0.025);
        termValues.put(ps.stemWord("creamer"), 0.025);
        termValues.put(ps.stemWord("quark"), 0.025);
        termValues.put(ps.stemWord("cream"), 0.025);

        // dishes
        termValues.put(ps.stemWord("cake"), 0.025);
        termValues.put(ps.stemWord("pie"), 0.025);
        termValues.put(ps.stemWord("crust"), 0.025);
        termValues.put(ps.stemWord("bread"), 0.025);
        termValues.put(ps.stemWord("omelet"), 0.025);
        termValues.put(ps.stemWord("soup"), 0.025);
    }

    public double recipeP(DocumentLanguageData dld) {
        Map<String, Double> values = new HashMap<>();
        int count = 0;

        for (var sentence : dld.sentences) {
            for (var word : sentence) {
                count++;

                final String stemmed = word.stemmed();
                final Double value = termValues.get(stemmed);

                if (value != null) {
                    values.put(stemmed, value);
                }
            }
        }

        if (count == 0) return 0.;

        double lengthPenalty = sqrt(AVG_RECIPE_LENGTH) / sqrt(max(AVG_RECIPE_LENGTH, count));

        return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
    }
}
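recipeP sums the weights of the distinct recipe-related stems that occur in the document (a stem counts once no matter how often it repeats, since matches land in a map keyed by stem), then scales by sqrt(AVG_RECIPE_LENGTH / max(AVG_RECIPE_LENGTH, count)) so that pages much longer than a typical recipe need proportionally more evidence. A rough usage sketch, assuming language models are configured the way RecipeDetectorTool further down does it:

    // Sketch only: score one HTML fragment for recipe-ness.
    var se = new SentenceExtractor(WmsaHome.getLanguageModels());
    var detector = new RecipeDetector();

    DocumentLanguageData dld = se.extractSentences(
            Jsoup.parse("<p>Ingredients: 2 dl milk, 1 tsp yeast. Preheat the oven, knead the dough.</p>"));

    double p = detector.recipeP(dld);  // RecipeDetectorTool below treats 100*p > 50 as a recipe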
@@ -26,6 +26,8 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.concurrent.Callable;

 import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;

@@ -39,7 +41,7 @@ public class IndexServicesFactory {

     private final PartitionedDataFile writerIndexFile;
     private final RootDataFile keywordLexiconFile;
-    private final PartitionedDataFile preconverterOutputFile;
+    private final DoublePartitionedDataFile preconverterOutputFile;
     private final DoublePartitionedDataFile indexReadWordsFile;
     private final DoublePartitionedDataFile indexReadUrlsFile;
     private final DoublePartitionedDataFile indexWriteWordsFile;

@@ -75,7 +77,7 @@ public class IndexServicesFactory {
         this.indexReadUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadUrlsFile);
         this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile);
         this.indexWriteUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteUrlsFile);
-        this.preconverterOutputFile = new PartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
+        this.preconverterOutputFile = new DoublePartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
         this.partitioner = partitioner;
     }

@@ -101,7 +103,7 @@ public class IndexServicesFactory {

     public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
         var converter = new SearchIndexConverter(block, id, tmpFileDir,
-                preconverterOutputFile.get(id),
+                preconverterOutputFile.get(id, block.ordinal()),
                 indexWriteWordsFile.get(id, block.id),
                 indexWriteUrlsFile.get(id, block.id),
                 partitioner,

@@ -112,19 +114,23 @@ public class IndexServicesFactory {

     @SneakyThrows
     public SearchIndexPreconverter getIndexPreconverter() {
-        File[] outputFiles = new File[DYNAMIC_BUCKET_LENGTH+1];
-        for (int i = 0; i < outputFiles.length; i++) {
-            outputFiles[i] = getPreconverterOutputFile(i);
+        Map<SearchIndexPreconverter.Shard, File> shards = new HashMap<>();
+
+        for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) {
+            for (IndexBlock block : IndexBlock.values()) {
+                shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block.ordinal()));
+            }
         }

         return new SearchIndexPreconverter(writerIndexFile.get(0),
-                outputFiles,
+                shards,
                 partitioner,
                 domainBlacklist
         );
     }

-    private File getPreconverterOutputFile(int i) {
-        return preconverterOutputFile.get(i);
+    private File getPreconverterOutputFile(int index, int block) {
+        return preconverterOutputFile.get(index, block);
     }

     @SneakyThrows
@@ -10,26 +10,42 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.File;
+import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
+import java.util.Map;
 import java.util.Objects;

 public class SearchIndexPreconverter {
     private final Logger logger = LoggerFactory.getLogger(getClass());

+    public record Shard(int bucket, int block) {}
+
+    private record ShardOutput(Shard shard, RandomAccessFile raf, FileChannel fc) {
+        public static ShardOutput fromFile(Shard s, File f) {
+            try {
+                var v = new RandomAccessFile(f, "rw");
+                v.seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
+                return new ShardOutput(s, v, v.getChannel());
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }

     @SneakyThrows
     @Inject
     public SearchIndexPreconverter(File inputFile,
-                                   File[] outputFiles,
+                                   Map<Shard, File> outputFiles,
                                    SearchIndexPartitioner partitioner,
                                    EdgeDomainBlacklist blacklist)
     {
         TIntHashSet spamDomains = blacklist.getSpamDomains();
         logger.info("Preconverting {}", inputFile);

-        for (File f : outputFiles) {
+        for (File f : outputFiles.values()) {
             if (f.exists()) {
                 Files.deleteIfExists(Objects.requireNonNull(f).toPath());
             }

@@ -41,15 +57,7 @@ public class SearchIndexPreconverter {

         logger.info("{}", indexJournalReader.fileHeader);

-        RandomAccessFile[] randomAccessFiles = new RandomAccessFile[outputFiles.length];
-        for (int i = 0; i < randomAccessFiles.length; i++) {
-            randomAccessFiles[i] = new RandomAccessFile(outputFiles[i], "rw");
-            randomAccessFiles[i].seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
-        }
-        FileChannel[] fileChannels = new FileChannel[outputFiles.length];
-        for (int i = 0; i < fileChannels.length; i++) {
-            fileChannels[i] = randomAccessFiles[i].getChannel();
-        }
+        ShardOutput[] outputs = outputFiles.entrySet().stream().map(entry -> ShardOutput.fromFile(entry.getKey(), entry.getValue())).toArray(ShardOutput[]::new);

         var lock = partitioner.getReadLock();
         try {

@@ -65,12 +73,14 @@ public class SearchIndexPreconverter {
                 buffer.clear();
                 entry.copyToBuffer(buffer);

-                for (int i = 0; i < randomAccessFiles.length; i++) {
-                    if (partitioner.filterUnsafe(domainId, i)) {
+                for (int i = 0; i < outputs.length; i++) {
+                    if (outputs[i].shard.block == entry.header.block().id
+                        && partitioner.filterUnsafe(domainId, outputs[i].shard.bucket))
+                    {
                         buffer.flip();

                         while (buffer.position() < buffer.limit())
-                            fileChannels[i].write(buffer);
+                            outputs[i].fc.write(buffer);
                     }
                 }
             }

@@ -80,14 +90,14 @@ public class SearchIndexPreconverter {
         }
         logger.info("Finalizing preconversion");

-        for (int i = 0; i < randomAccessFiles.length; i++) {
-            long pos = randomAccessFiles[i].getFilePointer();
-            randomAccessFiles[i].seek(0);
-            randomAccessFiles[i].writeLong(pos);
-            randomAccessFiles[i].writeLong(wordCountOriginal);
-            fileChannels[i].force(true);
-            fileChannels[i].close();
-            randomAccessFiles[i].close();
+        for (int i = 0; i < outputs.length; i++) {
+            long pos = outputs[i].raf.getFilePointer();
+            outputs[i].raf.seek(0);
+            outputs[i].raf.writeLong(pos);
+            outputs[i].raf.writeLong(wordCountOriginal);
+            outputs[i].fc.force(true);
+            outputs[i].fc.close();
+            outputs[i].raf.close();
         }
     }
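Each preconverted shard file starts with a header region that ShardOutput.fromFile skips past on open and that the finalization loop back-fills: the first long is the final write position (the shard's logical length) and the second is wordCountOriginal. A sketch of reading that header back, assuming the two-long layout written above (shardFile is a placeholder name, not from the commit):

    // Sketch: interpret the back-filled shard header.
    try (var raf = new RandomAccessFile(shardFile, "r")) {
        long logicalSize = raf.readLong(); // final write position, back-filled after conversion
        long wordCount   = raf.readLong(); // wordCountOriginal
        // journal entries follow from SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES onward
    }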
@@ -1,8 +1,15 @@
 package nu.marginalia.wmsa.edge.model;

-import lombok.*;
+import lombok.AllArgsConstructor;
+import lombok.NoArgsConstructor;
+import lombok.ToString;
+import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
+import nu.marginalia.wmsa.edge.crawling.WorkLog;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;

+import java.io.IOException;
 import java.nio.file.Path;
+import java.util.function.Consumer;

 @AllArgsConstructor @NoArgsConstructor @ToString
 public class EdgeCrawlPlan {

@@ -38,4 +45,16 @@ public class EdgeCrawlPlan {
         String sp2 = fileName.substring(2, 4);
         return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
     }

+    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
+        final CrawledDomainReader reader = new CrawledDomainReader();
+
+        WorkLog.readLog(crawl.getLogFile(), entry -> {
+            try {
+                consumer.accept(reader.read(getCrawledFilePath(entry.path())));
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        });
+    }
 }
@@ -0,0 +1,82 @@
package nu.marginalia.wmsa.edge.tools;

import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;

public class RecipeDetectorTool {
    private static final CrawledDomainReader reader = new CrawledDomainReader();
    private static final RecipeDetector detector = new RecipeDetector();
    private static final LanguageModels lm = WmsaHome.getLanguageModels();
    private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);

    private static final Set<String> urls = new HashSet<>(50_000_000);

    public static void main(String... args) throws IOException {
        EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
        DatabaseModule module = new DatabaseModule();

        try (var ds = module.provideConnection();
             var conn = ds.getConnection();
             var stmt = conn.createStatement()) {
            var rsp = stmt.executeQuery("SELECT URL FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
            while (rsp.next()) {
                urls.add(rsp.getString(1));
            }
        }
        catch (SQLException ex) {
            ex.printStackTrace();
        }

        ForkJoinPool pool = new ForkJoinPool(16);
        plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data)));

        while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
    }

    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) return;
        for (var doc : domain.doc) {
            if (!urls.contains(doc.url))
                continue;

            if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }
    }

    private static void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody);

        parsedDocument.getElementsByTag("a").remove();
        parsedDocument.getElementsByTag("nav").remove();

        DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
        double prob = 100*detector.recipeP(dld);
        if (prob > 50) {
            System.out.printf("%3.2f\t%s\n", prob, doc.url);
        }
    }
}
@@ -0,0 +1,86 @@
package nu.marginalia.wmsa.edge.tools;

import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import static nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature.CATEGORY_FOOD;

public class RecipesLoaderTool {
    public static void main(String... args) {

        try (EdgeIndexClient client = new EdgeIndexClient();
             HikariDataSource ds = new DatabaseModule().provideConnection();
             Connection conn = ds.getConnection();
             PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?");
             var linesStream = Files.lines(Path.of(args[0]))) {

            var urls = getUrls(ds);
            var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(CATEGORY_FOOD.getKeyword())));
            linesStream
                    .map(urls::get)
                    .filter(Objects::nonNull)
                    .forEach(id -> {
                        int urlId = (int) (id & 0xFFFF_FFFFL);
                        int domainId = (int) (id >>> 32L);

                        try {
                            ps.setInt(2, urlId);
                            ps.setInt(1, CATEGORY_FOOD.getFeatureBit());
                            ps.executeUpdate();
                        }
                        catch (SQLException ex) {
                            throw new RuntimeException(ex);
                        }

                        client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0)
                                .blockingSubscribe();
                    });

        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    private static Map<String, Long> getUrls(HikariDataSource ds) {
        Map<String, Long> urls = new HashMap<>(100_000);

        try (var conn = ds.getConnection();
             var stmt = conn.createStatement())
        {
            var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");

            while (rsp.next()) {
                long val = rsp.getInt(3);
                val = (val << 32L) | rsp.getInt(2);

                urls.put(rsp.getString(1), val);
            }
        }
        catch (SQLException ex) {
            throw new RuntimeException(ex);
        }

        return urls;
    }
}
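getUrls packs each row's DOMAIN_ID into the high 32 bits of a long and its ID into the low 32 bits, so a single map value carries both identifiers; the forEach in main unpacks them with a mask and an unsigned shift. A worked example of the round trip (variable names illustrative; it assumes non-negative ids, since a negative int would sign-extend across the high bits when OR'd in):

    int domainId = 1234, urlId = 99;

    long packed = ((long) domainId << 32L) | urlId;      // high word: domain, low word: url

    int unpackedUrl    = (int) (packed & 0xFFFF_FFFFL);  // 99
    int unpackedDomain = (int) (packed >>> 32L);         // 1234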