From 0a35a7c1d0c3a1e6489d3d1bae0ff4c89f46a4cf Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Thu, 20 Oct 2022 21:57:08 +0200
Subject: [PATCH] master (#119)

Co-authored-by: vlofgren
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/119
---
 .../wmsa/edge/EdgeCrawlBehaviorE2ETest.java   | 105 +++++
 .../wmsa/edge/EdgeSearchE2ETest.java          |   8 +-
 marginalia_nu/src/e2e/resources/crawl-mock.sh |  19 +
 .../jmh/java/nu/marginalia/BitSetTest.java    | 313 +++++++++++++
 .../ByteBufferBlockReadVsIndividualRead.java  |  85 ----
 .../nu/marginalia/util/AndCardIntSet.java     | 205 +++++++++
 .../util/BrailleBlockPunchCards.java          |  52 +++
 .../java/nu/marginalia/util/ListChunker.java  |   6 +-
 .../nu/marginalia/util/btree/BTreeDogEar.java |  33 ++
 .../util/btree/BTreeQueryBuffer.java          | 146 ++++++
 .../nu/marginalia/util/btree/BTreeReader.java | 279 +++++++++--
 .../nu/marginalia/util/btree/BTreeWriter.java |  14 +-
 .../util/btree/CachingBTreeReader.java        | 136 ------
 .../util/btree/model/BTreeContext.java        |   2 +-
 .../marginalia/util/dict/DictionaryData.java  |   1 -
 .../util/dict/DictionaryHashMap.java          |   5 +-
 .../marginalia/util/dict/DictionaryMap.java   |   9 +
 .../util/language/LanguageFilter.java         |  12 +-
 .../util/language/UnicodeRanges.java          |  17 +-
 .../util/language/WordPatterns.java           |  14 +-
 .../processing/DocumentKeywordExtractor.java  | 175 +++----
 .../language/processing/KeywordCounter.java   |  88 ++--
 .../language/processing/LongNameCounter.java  |  64 ---
 .../util/language/processing/NameCounter.java |   3 +-
 .../processing/SentenceExtractor.java         |  51 ++-
 .../language/processing/SubjectCounter.java   |  20 +-
 .../processing/model/KeywordMetadata.java     |  58 +++
 .../language/processing/model/WordRep.java    |  13 +-
 .../util/multimap/MultimapFileLong.java       | 432 +++++++++++++++++-
 .../multimap/MultimapFileLongOffsetSlice.java |  42 ++
 .../util/multimap/MultimapFileLongSlice.java  |  14 +
 .../util/multimap/MultimapSearcher.java       |  14 +-
 .../util/multimap/MultimapSearcherBase.java   |  73 +--
 .../util/multimap/MultimapSorter.java         | 158 +++++--
 .../ranking/tool/UpdateDomainRanksTool2.java  |  20 -
 .../EdgeDomainLinkConsineSimilarityMain.java  | 298 ++++++++++++
 .../wmsa/api/model/ApiSearchResult.java       |  26 ++
 .../model/ApiSearchResultQueryDetails.java    |  16 +
 .../wmsa/configuration/ServiceDescriptor.java |   6 +-
 .../configuration/command/CrawlCommand.java   |  24 +
 .../screenshot/ScreenshotService.java         |   9 +-
 .../wmsa/edge/converting/ConversionLog.java   |  69 +++
 .../converting/ConvertedDomainReader.java     |   1 -
 .../wmsa/edge/converting/ConverterMain.java   |  10 +-
 .../converting/LinkKeywordExtractorMain.java  |   2 +-
 .../converting/LinkKeywordLoaderMain.java     |  17 +-
 .../converting/LoadInstructionWriter.java     |   9 +-
 .../edge/converting/ReindexTriggerMain.java   |  34 +-
 .../converting/UpdateDomainStatistics.java    |  66 +++
 .../compiler/DocumentsCompiler.java           |  58 +++
 .../converting/compiler/FeedsCompiler.java    |  23 +
 .../compiler/InstructionsCompiler.java        |  57 +++
 .../converting/compiler/LinksCompiler.java    |  26 ++
 .../converting/compiler/RedirectCompiler.java |  19 +
 .../converting/compiler/UrlsCompiler.java     |  49 ++
 .../instruction/DocumentKeywords.java         |  36 +-
 .../LoadProcessedDocumentWithError.java       |   3 +-
 .../edge/converting/loader/SqlLoadUrls.java   |  27 +-
 .../model/DisqualifiedException.java          |  25 +-
 .../converting/model/ProcessedDocument.java   |   4 +
 .../processor/DocumentProcessor.java          | 120 +++--
 .../converting/processor/DomainProcessor.java |  91 +++-
 .../processor/InstructionsCompiler.java       | 116 -----
 .../processor/logic/DocumentValuator.java     |  10 +-
 .../processor/logic/FeatureExtractor.java     |  13 +-
 .../processor/logic/HtmlFeature.java          |   6 +-
 .../processor/logic/InternalLinkGraph.java    |  54 +++
 .../processor/logic/LinkParser.java           |   2 -
 .../processor/logic/QueryParams.java          |  54 ++-
 .../logic/topic/GoogleAnwersSpamDetector.java |  36 ++
 .../edge/crawling/CrawlJobExtractorMain.java  |  12 +-
 .../wmsa/edge/crawling/CrawlerTestMain.java   | 116 +++++
 .../edge/crawling/blocklist/UrlBlocklist.java |  33 +-
 .../crawling/model/CrawlingSpecification.java |   4 +
 .../crawling/retreival/CrawlerRetreiver.java  |  16 +-
 .../edge/crawling/retreival/HttpFetcher.java  |   2 +-
 .../wmsa/edge/data/dao/EdgeDataStoreDao.java  |   3 +-
 .../edge/data/dao/EdgeDataStoreDaoImpl.java   |  15 +-
 .../dao/task/EdgeDomainBlacklistImpl.java     |   4 -
 .../wmsa/edge/explorer/ExplorerMain.java      |  34 ++
 .../wmsa/edge/explorer/ExplorerService.java   | 253 ++++++++++
 .../wmsa/edge/index/EdgeIndexBucket.java      |  58 ++-
 .../wmsa/edge/index/EdgeIndexControl.java     |   5 +-
 .../wmsa/edge/index/EdgeIndexService.java     |   7 +-
 .../wmsa/edge/index/IndexServicesFactory.java |  32 +-
 .../edge/index/client/EdgeIndexClient.java    |   3 +
 .../index/client/EdgeIndexLocalService.java   |  22 +-
 .../conversion/SearchIndexConverter.java      |  54 ++-
 .../edge/index/conversion/SearchIndexDao.java |  22 +-
 .../conversion/SearchIndexPartitioner.java    |  15 +-
 .../conversion/SearchIndexPreconverter.java   |   4 +-
 .../words/WordIndexOffsetsTable.java          |   8 +-
 .../conversion/words/WordsTableWriter.java    |  12 +-
 .../journal/SearchIndexJournalReader.java     |   8 +-
 .../journal/SearchIndexJournalWriterImpl.java |   4 +-
 .../model/SearchIndexJournalEntry.java        |  24 +-
 .../edge/index/lexicon/KeywordLexicon.java    |   5 +-
 .../index/model/EdgeIndexSearchTerms.java     |  16 -
 .../edge/index/model/EdgePageWordFlags.java   |  32 ++
 .../index/model/EdgePageWordMetadata.java     |  90 ++++
 .../edge/index/model/EdgePutWordsRequest.java |  20 -
 .../wmsa/edge/index/model/IndexBlock.java     |  46 +-
 .../wmsa/edge/index/model/IndexBlockType.java |   7 +-
 .../edge/index/reader/IndexWordsTable.java    |  15 +-
 .../wmsa/edge/index/reader/SearchIndex.java   | 185 +------
 .../edge/index/reader/SearchIndexReader.java  |  71 ++-
 .../index/reader/SearchIndexURLRange.java     | 100 ++++
 .../svc/EdgeIndexDomainQueryService.java      | 111 +++++
 .../index/svc/EdgeIndexLexiconService.java    |  27 +-
 .../edge/index/svc/EdgeIndexQueryService.java | 313 +++++++------
 .../svc/query/IndexDomainQueryFactory.java    |  36 +-
 .../wmsa/edge/index/svc/query/IndexQuery.java |  52 ++-
 .../index/svc/query/IndexQueryCachePool.java  |  60 ---
 .../index/svc/query/IndexQueryFactory.java    |  88 +++-
 .../index/svc/query/IndexQueryParams.java     |  16 +
 .../svc/query/types/EmptyEntrySource.java     |  19 +
 .../index/svc/query/types/EntrySource.java    |   7 +-
 .../svc/query/types/EntrySourceFromBTree.java | 108 +++++
 .../query/types/EntrySourceFromMapRange.java  |  60 +++
 .../query/types/filter/QueryFilterAnyOf.java  |  40 +-
 .../types/filter/QueryFilterBTreeRange.java   |  33 --
 .../filter/QueryFilterBTreeRangeReject.java   |  27 ++
 .../filter/QueryFilterBTreeRangeRetain.java   |  27 ++
 .../query/types/filter/QueryFilterNoPass.java |   6 +-
 .../filter/QueryFilterStepFromPredicate.java  |   9 -
 .../query/types/filter/QueryFilterStepIf.java |  55 +--
 .../types/filter/QueryRankLimitingFilter.java |  37 ++
 .../integration/model/BasicDocumentData.java  |   2 +-
 .../StackOverflowPostProcessor.java           |  13 +-
 .../wikipedia/WikipediaProcessor.java         |  13 +-
 .../model/crawl/EdgeDomainIndexingState.java  |  22 +-
 .../edge/model/crawl/EdgePageWordSet.java     |  12 +-
 .../wmsa/edge/model/crawl/EdgePageWords.java  |  70 ++-
 .../wmsa/edge/model/crawl/EdgeUrlState.java   |   4 +-
 .../model/search/EdgeSearchResultItem.java    |  14 +-
 .../search/EdgeSearchResultKeywordScore.java  |  75 +--
 .../model/search/EdgeSearchSpecification.java |  15 +
 .../edge/model/search/EdgeUrlDetails.java     |  54 ++-
 .../wmsa/edge/search/EdgeSearchService.java   | 104 +----
 .../edge/search/command/SearchParameters.java |   4 +-
 .../command/commands/SearchCommand.java       |   6 +-
 .../command/commands/SiteListCommand.java     |  24 +-
 .../edge/search/model/DomainInformation.java  |  13 +-
 .../search/{ => model}/EdgeSearchProfile.java |  38 +-
 .../model/EdgeSearchRankingSymbols.java       |  10 +-
 .../wmsa/edge/search/model/SearchOrder.java   |  10 +
 .../edge/search/query/NearQueryProcessor.java |  65 +++
 .../wmsa/edge/search/query/QueryFactory.java  |  57 ++-
 .../wmsa/edge/search/query/QueryParser.java   |  18 +-
 .../query/model/EdgeUserSearchParameters.java |   2 +-
 .../search/results/SearchResultDecorator.java |  27 +-
 .../search/results/SearchResultValuator.java  | 204 ---------
 .../siteinfo/DomainInformationService.java    |  53 ++-
 .../svc/EdgeSearchAddToCrawlQueueService.java |  70 +++
 .../search/svc/EdgeSearchApiQueryService.java |  55 +++
 .../search/svc/EdgeSearchFlagSiteService.java | 125 +++++
 .../svc/EdgeSearchQueryIndexService.java      |  16 +-
 .../search/svc/EdgeSearchQueryService.java    |  70 +++
 .../valuation/SearchResultValuator.java       | 292 ++++++++++++
 .../edge/tools/ConverterLogicTestTool.java    |   7 +-
 .../wmsa/edge/tools/FeaturesLoaderTool.java   |   2 +-
 .../edge/tools/SearchIndexScrubberMain.java   |   2 +-
 .../wmsa/encyclopedia/EncyclopediaModule.java |   2 -
 .../main/resources/sql/edge-crawler-cache.sql |  56 ++-
 .../main/resources/static/edge/style-new.css  | 175 ++++++-
 .../main/resources/static/explore/style.css   |  20 +
 .../main/resources/templates/edge/index.hdb   |   5 +
 .../templates/edge/indict/indict-form.hdb     |  80 ++++
 .../templates/edge/parts/search-footer.hdb    | 103 ++++-
 .../templates/edge/parts/search-form.hdb      |   3 +-
 .../templates/edge/parts/site-info-index.hdb  |  50 ++
 .../templates/edge/parts/site-info-links.hdb  |  18 +
 .../templates/edge/search-result-metadata.hdb |  13 +-
 .../templates/edge/search-result.hdb          |   8 +-
 .../resources/templates/edge/site-info.hdb    |  24 +-
 .../templates/explorer/explorer-about.hdb     |  15 +
 .../templates/explorer/explorer-messages.hdb  |   2 +
 .../templates/explorer/explorer-results.hdb   |  23 +
 .../templates/explorer/explorer-search.hdb    |   7 +
 .../resources/templates/explorer/explorer.hdb |  25 +
 .../nu/marginalia/util/AndCardIntSetTest.java |  24 +
 .../util/BrailleBlockPunchCardsTest.java      |  17 +
 .../util/btree/BTreeWriterTest.java           |  42 +-
 .../btree/BTreeWriterTestCachedReader.java    | 335 --------------
 .../edge/crawling/LanguageFilterTest.java     |   5 +-
 .../edge/crawling/SentenceExtractorTest.java  |  15 +-
 .../query/types/QueryFilterStepIfTest.java    |  29 --
 .../edge/index/service/MultimapFileTest.java  | 125 ++++-
 .../service/SearchIndexJournalWriterTest.java | 104 ++++-
 .../edge/index/svc/query/IndexQueryTest.java  | 218 +++++++++
 .../integration/arxiv/ArxivParserTest.java    |   3 +-
 .../model/crawl/EdgePageWordMetadataTest.java |  64 +++
 .../search/query/BodyQueryParserTest.java     |   9 +
 .../edge/search/query/QueryVariantsTest.java  |   2 +
 protocol/def/index.proto                      |   1 +
 .../com/upserve/uppend/blobs/NativeIO.java    |   6 +-
 .../sentdetect/DefaultSDContextGenerator.java | 296 ++++++++++++
 .../tools/sentdetect/SentenceDetectorME.java  | 336 ++++++++++++++
 198 files changed, 7815 insertions(+), 2603 deletions(-)
 create mode 100644 marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java
 create mode 100644 marginalia_nu/src/e2e/resources/crawl-mock.sh
 create mode 100644 marginalia_nu/src/jmh/java/nu/marginalia/BitSetTest.java
 delete mode 100644 marginalia_nu/src/jmh/java/nu/marginalia/ByteBufferBlockReadVsIndividualRead.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeQueryBuffer.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/btree/CachingBTreeReader.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResultQueryDetails.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/FeedsCompiler.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/InstructionsCompiler.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/LinksCompiler.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/RedirectCompiler.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/UrlsCompiler.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/GoogleAnwersSpamDetector.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerMain.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexURLRange.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryCachePool.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryParams.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EmptyEntrySource.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromBTree.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromMapRange.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRange.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeReject.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeRetain.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryRankLimitingFilter.java
 rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/{ => model}/EdgeSearchProfile.java (66%)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/SearchOrder.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/NearQueryProcessor.java
 delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchApiQueryService.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchFlagSiteService.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryService.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java
 create mode 100644 marginalia_nu/src/main/resources/static/explore/style.css
 create mode 100644 marginalia_nu/src/main/resources/templates/edge/indict/indict-form.hdb
 create mode 100644 marginalia_nu/src/main/resources/templates/edge/parts/site-info-index.hdb
 create mode 100644 marginalia_nu/src/main/resources/templates/edge/parts/site-info-links.hdb
 create mode 100644 marginalia_nu/src/main/resources/templates/explorer/explorer-about.hdb
 create mode 100644 marginalia_nu/src/main/resources/templates/explorer/explorer-messages.hdb
 create mode 100644 marginalia_nu/src/main/resources/templates/explorer/explorer-results.hdb
 create mode 100644 marginalia_nu/src/main/resources/templates/explorer/explorer-search.hdb
 create mode 100644 marginalia_nu/src/main/resources/templates/explorer/explorer.hdb
 create mode 100644 marginalia_nu/src/test/java/nu/marginalia/util/AndCardIntSetTest.java
 create mode 100644 marginalia_nu/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java
 delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java
 delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java
 create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java
 create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java
 create mode 100644 third_party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
 create mode 100644 third_party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java

new CrawlingSpecification("222222", 20, "mock2", List.of("http://mock2:8080/intermittent-error/"))); + } + catch (IOException ex) { + ex.printStackTrace(); + } + return crawlFiles.toString(); + } + + + public static MountableFile ipDatabasePath() { + Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV"); + if (!Files.isRegularFile(modelsPath)) { + System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); + throw new RuntimeException(); + } + return MountableFile.forHostPath(modelsPath.toString()); + } + + private static Path getCrawlPath() { + return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl"); + } + + @Test + public void testRunTheThing() throws IOException { + // This is a test for examining the interaction between the crawler and various + // set-ups + } + +} diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index cc238354..1e1fad4b 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -175,7 +175,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { driver.get("http://proxyNginx/"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); +// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage")); } @@ -249,7 +249,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { driver.get("http://proxyNginx/search?query=browse:wikipedia.local"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); +// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse")); } @@ -259,7 +259,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { driver.get("http://proxyNginx/search?query=define:adiabatic"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); +// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define")); } @@ -269,7 +269,7 @@ public class EdgeSearchE2ETest extends E2ETestBase { driver.get("http://proxyNginx/search?query=3%2B3"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); +// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("eval")); } diff --git a/marginalia_nu/src/e2e/resources/crawl-mock.sh b/marginalia_nu/src/e2e/resources/crawl-mock.sh new file mode 100644 index 00000000..4270929e --- /dev/null +++ b/marginalia_nu/src/e2e/resources/crawl-mock.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +mkdir -p /var/lib/wmsa/conf/ +mkdir -p /var/lib/wmsa/data/ + +echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent + +cat crawl/crawl.plan +cat << EOF + #### ##### ## # # # + # # # # # # # # # + # # # # # # # # + # ##### ###### # ## # # + # # # # # # ## ## # + #### # # # # # # 
diff --git a/marginalia_nu/src/jmh/java/nu/marginalia/BitSetTest.java b/marginalia_nu/src/jmh/java/nu/marginalia/BitSetTest.java
new file mode 100644
index 00000000..1594f1a9
--- /dev/null
+++ b/marginalia_nu/src/jmh/java/nu/marginalia/BitSetTest.java
@@ -0,0 +1,313 @@
+package nu.marginalia;
+
+import nu.marginalia.util.AndCardIntSet;
+import org.openjdk.jmh.annotations.*;
+import org.roaringbitmap.RoaringBitmap;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+public class BitSetTest {
+    @org.openjdk.jmh.annotations.State(Scope.Benchmark)
+    public static class State {
+        List<RoaringBitmap> roar = new ArrayList<>();
+        List<AndCardIntSet> acbs = new ArrayList<>();
+
+        List<RoaringBitmap> roarLow = new ArrayList<>();
+        List<RoaringBitmap> roarHigh = new ArrayList<>();
+
+        List<AndCardIntSet> acbsLow = new ArrayList<>();
+        List<AndCardIntSet> acbsHigh = new ArrayList<>();
+
+        @Setup(Level.Trial)
+        public void setUp() {
+            var rand = new Random();
+
+            for (int i = 0; i < 100; i++) {
+                int card = 1 + rand.nextInt(10);
+
+                var rb = new RoaringBitmap();
+                var cbs = new AndCardIntSet();
+
+                for (int j = 0; j < card; j++) {
+                    int val = rand.nextInt(1_000_000);
+                    rb.add(val);
+                    cbs.add(val);
+                }
+                acbsLow.add(cbs);
+                roarLow.add(rb);
+            }
+
+            for (int i = 0; i < 10; i++) {
+                int card = 1 + rand.nextInt(10000, 20000);
+
+                var rb = new RoaringBitmap();
+
+                for (int j = 0; j < card; j++) {
+                    int val = rand.nextInt(1_000_000);
+                    rb.add(val);
+                }
+                acbsHigh.add(AndCardIntSet.of(rb));
+                roarHigh.add(rb);
+            }
+
+
+
+            for (int i = 0; i < 100000; i++) {
+                var rb = new RoaringBitmap();
+                var cbs = new AndCardIntSet();
+
+                int val = rand.nextInt(1_000_000);
+                rb.add(val);
+                cbs.add(val);
+
+                acbs.add(cbs);
+                roar.add(rb);
+            }
+
+            for (int i = 0; i < 10000; i++) {
+                int card = 1 + rand.nextInt(10);
+
+                var rb = new RoaringBitmap();
+                var cbs = new AndCardIntSet();
+
+                for (int j = 0; j < card; j++) {
+                    int val = rand.nextInt(1_000_000);
+                    rb.add(val);
+                    cbs.add(val);
+                }
+                acbs.add(cbs);
+                roar.add(rb);
+            }
+            for (int i = 0; i < 1000; i++) {
+                int card = 1 + rand.nextInt(100);
+
+                var rb = new RoaringBitmap();
+                var cbs = new AndCardIntSet();
+
+                for (int j = 0; j < card; j++) {
+                    int val = rand.nextInt(1_000_000);
+                    rb.add(val);
+                    cbs.add(val);
+                }
+                acbs.add(cbs);
+                roar.add(rb);
+            }
+            for (int i = 0; i < 100; i++) {
+                int card = 1 + rand.nextInt(1000);
+
+                var rb = new RoaringBitmap();
+                var cbs = new AndCardIntSet();
+
+                for (int j = 0; j < card; j++) {
+                    int val = rand.nextInt(1_000_000);
+                    rb.add(val);
+                    cbs.add(val);
+                }
+                acbs.add(cbs);
+                roar.add(rb);
+            }
+            for (int i = 0; i < 100; i++) {
+                int card = 1 + rand.nextInt(10000);
+
+                var rb = new RoaringBitmap();
+                var cbs = new AndCardIntSet();
+
+                for (int j = 0; j < card; j++) {
+                    int val = rand.nextInt(1_000_000);
+                    rb.add(val);
+                    cbs.add(val);
+                }
+                acbs.add(cbs);
+                roar.add(rb);
+            }
+
+            for (int i = 0; i < 2; i++) {
+                int card = 1 + rand.nextInt(100000);
+
+                var rb = new RoaringBitmap();
+                var cbs = new AndCardIntSet();
+
+                for (int j = 0; j < card; j++) {
+                    int val = rand.nextInt(1_000_000);
+                    rb.add(val);
+                    cbs.add(val);
+                }
+                acbs.add(cbs);
+                roar.add(rb);
+            }
+            Collections.shuffle(acbs);
+            Collections.shuffle(roar);
+        }
+    }
+
+//
+//    @Benchmark
+//    @BenchmarkMode(Mode.Throughput)
+//    @Fork(value = 5, warmups = 5)
+//    public Object roaringCard(State state) {
+//        long val = 0;
+//
+//        for (int i = 0; i < state.roar.size(); i++) {
+//            for (int j = i+1; j < state.roar.size(); j++) {
+//                val += RoaringBitmap.andCardinality(state.roar.get(i), state.roar.get(j));
+//            }
+//        }
+//
+//        return val;
+//    }
+//    @Benchmark
+//    @BenchmarkMode(Mode.Throughput)
+//    @Fork(value = 2, warmups = 2)
+//    public Object roaringCardNorm(State state) {
+//        long val = 0;
+//
+//        for (int i = 0; i < state.roar.size()/1000; i++) {
+//            for (int j = i+1; j < state.roar.size(); j++) {
+//
+//                var a = state.roar.get(i);
+//                var b = state.roar.get(j);
+//                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
+//            }
+//        }
+//
+//        return val;
+//    }
+//    @Benchmark
+//    @BenchmarkMode(Mode.Throughput)
+//    @Fork(value = 5, warmups = 5)
+//    public Object cbsCard(State state) {
+//        long val = 0;
+//
+//        for (int i = 0; i < state.roar.size(); i++) {
+//            for (int j = i+1; j < state.roar.size(); j++) {
+//                val += AndCardIntSet.andCardinality(state.acbs.get(i), state.acbs.get(j));
+//            }
+//        }
+//
+//        return val;
+//    }
+//
+//    @Benchmark
+//    @BenchmarkMode(Mode.Throughput)
+//    @Fork(value = 1, warmups = 1)
+//    public Object cbsCardNorm(State state) {
+//        double val = 0;
+//
+//        for (int i = 0; i < state.roar.size()/1000; i++) {
+//            for (int j = i+1; j < state.roar.size(); j++) {
+//                var a = state.acbs.get(i);
+//                var b = state.acbs.get(j);
+//                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.cardinality()*b.cardinality()));
+//            }
+//        }
+//
+//        return val;
+//    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    @Fork(value = 1, warmups = 1)
+    public Object cbsLowLow(State state) {
+        double val = 0;
+
+        for (int i = 0; i < state.acbsLow.size(); i++) {
+            for (int j = 0; j < state.acbsLow.size(); j++) {
+                var a = state.acbsLow.get(i);
+                var b = state.acbsLow.get(j);
+                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
+            }
+        }
+
+        return val;
+    }
+
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    @Fork(value = 1, warmups = 1)
+    public Object cbsHighHigh(State state) {
+        double val = 0;
+
+        for (int i = 0; i < state.acbsHigh.size(); i++) {
+            for (int j = 0; j < state.acbsHigh.size(); j++) {
+                var a = state.acbsHigh.get(i);
+                var b = state.acbsHigh.get(j);
+                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
+            }
+        }
+
+        return val;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    @Fork(value = 1, warmups = 1)
+    public Object cbsHighLow(State state) {
+        double val = 0;
+
+        for (int i = 0; i < state.acbsHigh.size(); i++) {
+            for (int j = 0; j < state.acbsLow.size(); j++) {
+                var a = state.acbsHigh.get(i);
+                var b = state.acbsLow.get(j);
+                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
+            }
+        }
+
+        return val;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    @Fork(value = 1, warmups = 1)
+    public Object roarLowLow(State state) {
+        double val = 0;
+
+        for (int i = 0; i < state.roarLow.size(); i++) {
+            for (int j = 0; j < state.roarLow.size(); j++) {
+                var a = state.roarLow.get(i);
+                var b = state.roarLow.get(j);
+                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
+            }
+        }
+
+        return val;
+    }
+
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    @Fork(value = 1, warmups = 1)
+    public Object roarHighLow(State state) {
+        double val = 0;
+
+        for (int i = 0; i < state.roarHigh.size(); i++) {
+            for (int j = 0; j < state.roarLow.size(); j++) {
+                var a = state.roarHigh.get(i);
+                var b = state.roarLow.get(j);
+                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
+            }
+        }
+
+        return val;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    @Fork(value = 1, warmups = 1)
+    public Object roarHighHigh(State state) {
+        double val = 0;
+
+        for (int i = 0; i < state.roarHigh.size(); i++) {
+            for (int j = 0; j < state.roarHigh.size(); j++) {
+                var a = state.roarHigh.get(i);
+                var b = state.roarHigh.get(j);
+                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
+            }
+        }
+
+        return val;
+    }
+}
\ No newline at end of file
diff --git a/marginalia_nu/src/jmh/java/nu/marginalia/ByteBufferBlockReadVsIndividualRead.java b/marginalia_nu/src/jmh/java/nu/marginalia/ByteBufferBlockReadVsIndividualRead.java
deleted file mode 100644
index 097e1408..00000000
--- a/marginalia_nu/src/jmh/java/nu/marginalia/ByteBufferBlockReadVsIndividualRead.java
+++ /dev/null
@@ -1,85 +0,0 @@
-package nu.marginalia;
-
-import lombok.SneakyThrows;
-import nu.marginalia.util.multimap.MultimapFileLong;
-import org.openjdk.jmh.annotations.*;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.stream.IntStream;
-import java.util.stream.LongStream;
-
-public class ByteBufferBlockReadVsIndividualRead {
-
-    @State(Scope.Benchmark)
-    public static class ByteBufferState {
-        private MultimapFileLong mmf;
-        private Path file;
-        private static final int size = 800*1024*1024;
-        @Setup(Level.Iteration)
-        @SneakyThrows
-        public void setUp() {
-            file = Files.createTempFile("jmh", ".dat");
-            mmf = MultimapFileLong.forOutput(file, size);
-            for (int i = 0; i < size; i++) {
-                mmf.put(i, i);
-            }
-        }
-
-        @TearDown(Level.Iteration)
-        @SneakyThrows
-        public void tearDown() {
-            mmf.close();
-            Files.delete(file);
-        }
-
-        LongStream basicStream() {
-            return IntStream.range(0, size).mapToLong(mmf::get);
-        }
-
-        LongStream blockStream(int blockSize) {
-            long urlOffset = 0;
-            long endOffset = size;
-
-            long[] arry = new long[blockSize];
-
-            return LongStream
-                    .iterate(urlOffset, i -> i< endOffset, i->i+blockSize)
-                    .flatMap(pos -> {
-                        int sz = (int)(Math.min(pos+blockSize, endOffset) - pos);
-                        mmf.read(arry, sz, pos);
-                        return Arrays.stream(arry, 0, sz);
-                    });
-        }
-    }
-
-
-
-    //    @Benchmark @BenchmarkMode(Mode.Throughput)
-    //    @Fork(value = 1, warmups = 1)
-    //    @Warmup(iterations = 1)
-    public long testBasic(ByteBufferState state) {
-        return state.basicStream().sum();
-    }
-
-
-    @Benchmark @BenchmarkMode(Mode.Throughput)
-    @Fork(value = 1, warmups = 1)
-    @Warmup(iterations = 0)
-    public long testBlock128(ByteBufferState state) {
-        return state.blockStream(128).sum();
-    }
-    @Benchmark @BenchmarkMode(Mode.Throughput)
-    @Fork(value = 1, warmups = 1)
-    @Warmup(iterations = 0)
-    public long testBlock1024(ByteBufferState state) {
-        return state.blockStream(1024).sum();
-    }
-    @Benchmark @BenchmarkMode(Mode.Throughput)
-    @Fork(value = 1, warmups = 1)
-    @Warmup(iterations = 0)
-    public long testBlock8192(ByteBufferState state) {
-        return state.blockStream(8192).sum();
-    }
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java b/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java
new file mode 100644
index 00000000..ccc3f1c6
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java
@@ -0,0 +1,205 @@
+package nu.marginalia.util;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.set.hash.TIntHashSet;
+import org.roaringbitmap.RoaringBitmap;
+
+
+public class AndCardIntSet {
+    final TIntArrayList backingList;
+    long hash;
+
+    public AndCardIntSet() {
+        backingList = new TIntArrayList(16);
+        backingList.sort();
+    }
+
+    public static AndCardIntSet of(int... list) {
+        var set = new TIntHashSet(list);
+        TIntArrayList lst = new TIntArrayList(set);
+        lst.sort();
+
+        return new AndCardIntSet(lst);
+    }
+
+    public static AndCardIntSet of(RoaringBitmap bmap) {
+
+        TIntArrayList lst = new TIntArrayList(bmap.getCardinality());
+        lst.addAll(bmap.toArray());
+
+        return new AndCardIntSet(lst);
+    }
+
+
+    private AndCardIntSet(TIntArrayList list) {
+        backingList = list;
+        hash = 0;
+
+        if (list.size() < 128) {
+            for (int v : list.toArray()) {
+                int bit = hasher.hashInt(v).asInt() % 64;
+                hash |= (1L << bit);
+            }
+        }
+        else {
+            hash = ~0L;
+        }
+
+    }
+
+    private static final HashFunction hasher = Hashing.murmur3_128(0);
+
+    public boolean add(int val) {
+        if (contains(val)) {
+            return false;
+        }
+
+        if (backingList.size() < 128) {
+            int bit = hasher.hashInt(val).asInt() % 64;
+            hash |= (1L << bit);
+        }
+        else {
+            hash = ~0L;
+        }
+        backingList.add(val);
+        backingList.sort();
+        return true;
+    }
+
+    public boolean contains(int val) {
+        return backingList.binarySearch(val) >= 0;
+    }
+
+    public int getCardinality() {
+        return backingList.size();
+    }
+
+    public static int andCardinality(AndCardIntSet a, AndCardIntSet b) {
+
+        if (!testHash(a,b)) {
+            return 0;
+        }
+
+        if (a.getCardinality() + b.getCardinality() < 10) {
+            return andLinearSmall(a, b);
+        }
+
+        return andLinear(a,b);
+    }
+
+    private static int andLinearSmall(AndCardIntSet a, AndCardIntSet b) {
+        int sum = 0;
+        for (int i = 0; i < a.getCardinality(); i++) {
+            for (int j = 0; j < b.getCardinality(); j++) {
+                if (a.backingList.getQuick(i) == b.backingList.getQuick(j))
+                    sum++;
+            }
+        }
+        return sum;
+    }
+
+    private static int andLinear(AndCardIntSet a, AndCardIntSet b) {
+
+        int i = 0, j = 0;
+        int card = 0;
+
+        do {
+            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
+
+            if (diff < 0) i++;
+            else if (diff > 0) j++;
+            else {
+                i++;
+                j++;
+                card++;
+            }
+        } while (i < a.getCardinality() && j < b.getCardinality());
+
+        return card;
+
+    }
+
+    private static boolean testHash(AndCardIntSet a, AndCardIntSet b) {
+        return (a.hash & b.hash) != 0;
+    }
+
+    public boolean cardinalityExceeds(int val) {
+        return getCardinality() >= val;
+    }
+
+    public static AndCardIntSet and(AndCardIntSet a, AndCardIntSet b) {
+        int i = 0;
+        int j = 0;
+
+        TIntArrayList andVals = new TIntArrayList(1 + (int)Math.sqrt(a.getCardinality()));
+
+        while (i < a.getCardinality() && j < b.getCardinality()) {
+            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
+            if (diff < 0) i++;
+            else if (diff > 0) j++;
+            else {
+                andVals.add(a.backingList.getQuick(i));
+                i++;
+                j++;
+            }
+        }
+
+        return new AndCardIntSet(andVals);
+    }
+
+    public static double weightedProduct(float[] weights, AndCardIntSet a, AndCardIntSet b) {
+        int i = 0;
+        int j = 0;
+
+        double sum = 0;
+
+        if (a.getCardinality() + b.getCardinality() < 10) {
+            return weightedProductSmall(weights, a, b);
+        }
+
+        do {
+            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
+            if (diff < 0) i++;
+            else if (diff > 0) j++;
+            else {
+                sum += weights[a.backingList.getQuick(i)];
+                i++;
+                j++;
+            }
+        } while (i < a.getCardinality() && j < b.getCardinality());
+
+        return sum;
+    }
+
+
+    private static double weightedProductSmall(float[] weights, AndCardIntSet a, AndCardIntSet b) {
+        double sum = 0;
+
+        for (int i = 0; i < a.getCardinality(); i++) {
+            for (int j = 0; j < b.getCardinality(); j++) {
+                int av = a.backingList.getQuick(i);
+                int bv = b.backingList.getQuick(j);
+                if (av == bv)
+                    sum+=weights[av];
+            }
+        }
+
+        return sum;
+    }
+    public double mulAndSum(float[] weights) {
+        double sum = 0;
+        for (int i = 0; i < backingList.size(); i++) {
+            sum += weights[backingList.getQuick(i)];
+        }
+        return sum;
+    }
+    public int[] toArray() {
+        return backingList.toArray();
+    }
+
+    public TIntArrayList values() {
+        return backingList;
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java b/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java
new file mode 100644
index 00000000..e0a3c9db
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java
@@ -0,0 +1,52 @@
+package nu.marginalia.util;
+
+public class BrailleBlockPunchCards {
+
+    public static String printBits(int val, int bits) {
+        StringBuilder builder = new StringBuilder();
+
+        for (int b = 0; b < bits; b+=8, val>>>=8) {
+            builder.append((char)('\u2800'+bin2brail(val)));
+        }
+
+        return builder.toString();
+    }
+
+    /* The braille block in unicode U2800 is neat because it contains
+     * 8 "bits", but for historical reasons, they're addressed in a bit
+     * of an awkward way. Braille used to be a 2x6 grid, but it was extended
+     * to 2x8.
+     *
+     * It's addressed as follows
+     *
+     *   0 3
+     *   1 4
+     *   2 5
+     *   6 7   <-- extended braille
+     *
+     *
+     * We want to use it as a dot matrix to represent bits. To do that we need
+     * to do this transformation:
+     *
+     *  0 1 2 3 4 5 6 7   native order bits
+     *  | | |  \ _\__\/   |
+     *  | | | /  \  \ \   |
+     *  0 1 2 6  3  4 5   7   braille order bits
+     *
+     *  01 02 04 08 10 20 40 80
+     *  01+02+04      +80 : &0x87
+     *  10+20+40 >>       : &0x70, >>1
+     *  08 << << <<       : &0x08, <<3
+     *
+     * Or in other words we do
+     *   (v & 0x87)
+     * | ((v & 0x70) >> 1)
+     * | ((v & 0x08) << 3)
+     *
+     * Thanks for coming to my TED talk.
+     */
+
+    private static char bin2brail(int v) {
+        return (char)((v & 0x87) | ((v & 0x70) >> 1) | ((v & 0x08) << 3));
+    }
+}
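A quick way to sanity-check the permutation: the low nibble of a byte should light the left column of the braille cell, and the high nibble the right column. A hypothetical check, not part of the patch (output characters computed by hand from bin2brail):

    class PunchCardDemo {
        public static void main(String[] args) {
            System.out.println(BrailleBlockPunchCards.printBits(0x0f, 8)); // "⡇" -- left column only
            System.out.println(BrailleBlockPunchCards.printBits(0xf0, 8)); // "⢸" -- right column only
            System.out.println(BrailleBlockPunchCards.printBits(0xff, 8)); // "⣿" -- all eight dots
        }
    }
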
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java b/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java
index ef27ba1d..0c99d7a1 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java
@@ -1,5 +1,7 @@
 package nu.marginalia.util;
 
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -14,13 +16,13 @@ public class ListChunker {
      *
      * @see List#subList
      */
-    public static <T> List<List<T>> chopList(List<T> data, int size) {
+    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
         if (data.isEmpty())
             return Collections.emptyList();
         else if (data.size() < size)
             return List.of(data);
 
-        final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
+        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
 
         for (int i = 0; i < data.size(); i+=size) {
             ret.add(data.subList(i, Math.min(data.size(), i+size)));
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java
new file mode 100644
index 00000000..f21eeb9d
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java
@@ -0,0 +1,33 @@
+package nu.marginalia.util.btree;
+
+import nu.marginalia.util.btree.model.BTreeContext;
+import nu.marginalia.util.btree.model.BTreeHeader;
+import nu.marginalia.util.multimap.MultimapFileLongSlice;
+
+/*
+ * End-of-page mark that's used as a sentinel to verify that
+ * the BTreeWriter's caller actually writes as much as they say
+ * they want to. (Failing to do so will corrupt the tree)
+ *
+ */
+public class BTreeDogEar {
+
+    private MultimapFileLongSlice sentinelSlice;
+
+    public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) {
+        if (header.numEntries() > 3) {
+            sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3);
+            sentinelSlice.put(0, 4L);
+            sentinelSlice.put(1, 5L);
+            sentinelSlice.put(2, 1L);
+        }
+    }
+
+    public boolean verify() {
+        if (sentinelSlice == null)
+            return true;
+
+        return 4 != sentinelSlice.get(0) || 5 != sentinelSlice.get(1) || 1 != sentinelSlice.get(2);
+    }
+
+}
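The mechanism is simple: the constructor stamps the words 4, 5, 1 into the last three slots of the region the caller has promised to fill, and verify() reports success only once that stamp has been overwritten. Using three words rather than one makes a false alarm (real data that legitimately ends in exactly 4, 5, 1) unlikely. A sketch of the writer-side flow, with hypothetical ctx/header/slice values; the real call site is in the BTreeWriter change further down:

    BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);

    writeIndexCallback.write(slice);   // expected to fill all numEntries * entrySize words

    if (!dogEar.verify()) {
        // the sentinel survived: the callback wrote fewer entries than promised
        logger.error("Dog ear was not overwritten: {}", header);
    }
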
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeQueryBuffer.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeQueryBuffer.java
new file mode 100644
index 00000000..3553a97b
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeQueryBuffer.java
@@ -0,0 +1,146 @@
+package nu.marginalia.util.btree;
+
+import java.util.Arrays;
+
+public class BTreeQueryBuffer {
+    public final long[] data;
+    public int end;
+
+    private int read = 0;
+    private int write = 0;
+
+    public BTreeQueryBuffer(int size) {
+        this.data = new long[size];
+        this.end = size;
+    }
+
+    public BTreeQueryBuffer(long [] data, int size) {
+        this.data = data;
+        this.end = size;
+    }
+
+    private BTreeQueryBuffer(long [] data) {
+        this.data = data;
+        this.end = data.length;
+    }
+
+    public BTreeQueryBuffer[] split(int... splitPoints) {
+        BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1];
+
+        ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0]));
+        for (int i = 1; i < splitPoints.length; i++) {
+            ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i]));
+        }
+        ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end));
+
+        return ret;
+    }
+
+    public void gather(BTreeQueryBuffer... buffers) {
+        int start = 0;
+
+        for (var buffer : buffers) {
+            System.arraycopy(buffer.data, 0, data, start, buffer.end);
+            start += buffer.end;
+        }
+
+        this.read = 0;
+        this.write = 0;
+        this.end = start;
+    }
+
+    public long[] copyData() {
+        return Arrays.copyOf(data, end);
+    }
+
+    public void retainAll() {
+        read = write = end;
+    }
+
+    public boolean isEmpty() {
+        return end == 0;
+    }
+
+    public int size() {
+        return end;
+    }
+
+    public long currentValue() {
+        return data[read];
+    }
+
+    public boolean rejectAndAdvance() {
+        return ++read < end;
+    }
+
+    public boolean retainAndAdvance() {
+        if (read != write) {
+            long tmp = data[write];
+            data[write] = data[read];
+            data[read] = tmp;
+        }
+
+        write++;
+
+        return ++read < end;
+    }
+
+    public boolean hasMore() {
+        return read < end;
+    }
+
+    public void finalizeFiltering() {
+        end = write;
+        read = 0;
+        write = 0;
+    }
+
+    public void startFilterForRange(int pos, int end) {
+        read = write = pos;
+        this.end = end;
+    }
+
+    public void reset() {
+        end = data.length;
+        read = 0;
+        write = 0;
+    }
+
+    public void zero() {
+        end = 0;
+        read = 0;
+        write = 0;
+        Arrays.fill(data, 0);
+    }
+
+    public void uniq() {
+        if (end <= 1) return;
+
+        long prev = currentValue();
+        retainAndAdvance();
+
+        while (hasMore()) {
+
+            long val = currentValue();
+
+            if (prev == val) {
+                rejectAndAdvance();
+            } else {
+                retainAndAdvance();
+                prev = val;
+            }
+
+        }
+
+        finalizeFiltering();
+    }
+
+    public String toString() {
+        return getClass().getSimpleName() + "[" +
+            "read = " + read +
+            ",write = " + write +
+            ",end = " + end +
+            ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
+    }
+
+}
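The buffer implements in-place filtering with two cursors: read scans ahead while write marks the end of the kept prefix; retainAndAdvance() swaps the current value down into the write region, rejectAndAdvance() simply skips it, and finalizeFiltering() truncates the buffer to what was kept. A minimal sketch of the intended use, not taken from the patch:

    BTreeQueryBuffer buffer = new BTreeQueryBuffer(new long[] { 1, 2, 3, 4, 5, 6 }, 6);

    while (buffer.hasMore()) {
        if (buffer.currentValue() % 2 == 0)
            buffer.retainAndAdvance();    // keep even values
        else
            buffer.rejectAndAdvance();    // drop odd values
    }
    buffer.finalizeFiltering();

    // buffer.copyData() now returns [2, 4, 6]
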
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
index c3794acb..472478ea 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
@@ -1,5 +1,7 @@
 package nu.marginalia.util.btree;
 
+import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
+import lombok.SneakyThrows;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
@@ -14,70 +16,275 @@ public class BTreeReader {
 
     private final MultimapSearcher indexSearcher;
     private final MultimapSearcher dataSearcher;
+    private final BTreeHeader header;
 
-    public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
+    public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) {
         this.file = file;
         this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
         this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
 
         this.ctx = ctx;
+        this.header = header;
     }
 
-    public BTreeHeader getHeader(long fileOffset) {
+    public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) {
+        this.file = file;
+        this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
+        this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
+
+        this.ctx = ctx;
+        this.header = createHeader(file, offset);
+    }
+
+    public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) {
         return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
     }
 
+    public BTreeHeader getHeader() {
+        return header;
+    }
+
+    public int numEntries() {
+        return header.numEntries();
+    }
+
+    @SneakyThrows
+    public void retainEntries(BTreeQueryBuffer buffer) {
+        if (header.layers() == 0) {
+            BTreePointer pointer = new BTreePointer(header);
+            pointer.retainData(buffer);
+        }
+        retainSingle(buffer);
+    }
+
+    @SneakyThrows
+    public void rejectEntries(BTreeQueryBuffer buffer) {
+        if (header.layers() == 0) {
+            BTreePointer pointer = new BTreePointer(header);
+            pointer.rejectData(buffer);
+        }
+        rejectSingle(buffer);
+    }
+
+    private void retainSingle(BTreeQueryBuffer buffer) {
+
+        BTreePointer pointer = new BTreePointer(header);
+
+        for (; buffer.hasMore(); pointer.resetToRoot()) {
+
+            long val = buffer.currentValue() & ctx.equalityMask();
+
+            if (!pointer.walkToData(val)) {
+                buffer.rejectAndAdvance();
+                continue;
+            }
+
+            pointer.retainData(buffer);
+        }
+    }
+
+    private void rejectSingle(BTreeQueryBuffer buffer) {
+        BTreePointer pointer = new BTreePointer(header);
+
+        for (; buffer.hasMore(); pointer.resetToRoot()) {
+
+            long val = buffer.currentValue() & ctx.equalityMask();
+
+            if (pointer.walkToData(val) && pointer.containsData(val)) {
+                buffer.rejectAndAdvance();
+            }
+            else {
+                buffer.retainAndAdvance();
+            }
+        }
+    }
+
+
     /**
      *
      * @return file offset of entry matching keyRaw, negative if absent
      */
-    public long findEntry(BTreeHeader header, final long keyRaw) {
-        final int blockSize = ctx.BLOCK_SIZE_WORDS();
-
+    public long findEntry(final long keyRaw) {
         final long key = keyRaw & ctx.equalityMask();
-        final long dataAddress = header.dataOffsetLongs();
 
-        final long searchStart;
-        final long numEntries;
+        BTreePointer ip = new BTreePointer(header);
 
-        if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
-            searchStart = dataAddress;
-            numEntries = header.numEntries();
+        while (!ip.isDataLayer())
+            ip.walkToChild(key);
+
+        return ip.findData(key);
+    }
+
+    public void readData(long[] data, int n, long pos) {
+        file.read(data, n, header.dataOffsetLongs() + pos);
+    }
+
+    public long[] queryData(long[] urls, int offset) {
+        BTreePointer pointer = new BTreePointer(header);
+
+        long[] ret = new long[urls.length];
+
+        for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) {
+            if (pointer.walkToData(urls[i])) {
+                long dataAddress = pointer.findData(urls[i]);
+                if (dataAddress >= 0) {
+                    ret[i] = file.get(dataAddress + offset);
+                }
+            }
         }
-        else {
-            long dataLayerOffset = searchIndex(header, key);
-            if (dataLayerOffset < 0) {
-                return dataLayerOffset;
+
+        return ret;
+    }
+
+    /** Find the range of values so that prefixStart <= n < prefixNext */
+    public LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) {
+        long lowerBoundStart = lowerBound(prefixStart);
+        long lowerBoundEnd = lowerBound(prefixNext);
+
+        return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd);
+    }
+
+    private long lowerBound(long key) {
+        key &= ctx.equalityMask();
+
+        BTreePointer ip = new BTreePointer(header);
+
+        while (!ip.isDataLayer())
+            ip.walkToChild(key);
+
+        return ip.findDataLower(key);
+    }
+
+    private class BTreePointer {
+        private final long[] layerOffsets;
+
+        private int layer;
+        private long offset;
+        private long boundary;
+
+        public String toString() {
+            return getClass().getSimpleName() + "[" +
+                    "layer = " + layer + " ," +
+                    "offset = " + offset + "]";
+        }
+
+        public BTreePointer(BTreeHeader header) {
+            layer = header.layers() - 1;
+            offset = 0;
+            layerOffsets = header.getRelativeLayerOffsets(ctx);
+            boundary = Long.MAX_VALUE;
+        }
+
+        public void resetToRoot() {
+            this.layer = header.layers() - 1;
+            this.offset = 0;
+            this.boundary = Long.MAX_VALUE;
+        }
+
+        public int layer() {
+            return layer;
+        }
+
+        public boolean walkToChild(long key) {
+            final long indexAddress = header.indexOffsetLongs();
+
+            final long indexLayerBlockOffset = layerOffsets[layer] + offset;
+
+            final long searchStart = indexAddress + indexLayerBlockOffset;
+            final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart);
+
+            if (nextLayerOffset < 0)
+                return false;
+
+            layer --;
+            boundary = file.get(searchStart + offset);
+            offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset);
+
+            return true;
+        }
+
+        public boolean walkToData(long key) {
+            while (!isDataLayer()) {
+                if (!walkToChild(key)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        public boolean isDataLayer() {
+            return layer < 0;
+        }
+
+        public boolean containsData(long key) {
+            return findData(key) >= 0;
+        }
+
+        public long findData(long key) {
+            if (layer > 0) {
+                throw new IllegalStateException("Looking for data in an index layer");
             }
 
-            searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
-            numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
+            long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
+            int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
+
+            return dataSearcher.binarySearch(key, searchStart, numEntries);
         }
 
-        return dataSearcher.binarySearch(key, searchStart, numEntries);
-    }
+        public long findDataLower(long key) {
+            if (layer > 0) {
+                throw new IllegalStateException("Looking for data in an index layer");
+            }
 
-    private long searchIndex(BTreeHeader header, long key) {
-        final int blockSize = ctx.BLOCK_SIZE_WORDS();
-        final long indexAddress = header.indexOffsetLongs();
+            long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
+            int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
 
-        long layerOffset = 0;
+            return dataSearcher.binarySearchLower(key, searchStart, numEntries);
+        }
 
-        for (int i = header.layers() - 1; i >= 0; --i) {
-            final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
-
-            final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
-            if (nextLayerOffset < 0)
-                return nextLayerOffset;
-
-            layerOffset = blockSize * (nextLayerOffset + layerOffset);
+        public void retainData(BTreeQueryBuffer buffer) {
+
+            long dataOffset = findData(buffer.currentValue());
+            if (dataOffset >= 0) {
+                buffer.retainAndAdvance();
+
+                long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
+                long relOffset = dataOffset - blockBase;
+
+                int numEntries =
+                        min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
+
+                if (buffer.currentValue() <= boundary) {
+                    file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
+                }
+            }
+            else {
+                buffer.rejectAndAdvance();
+            }
+
+        }
+
+        public void rejectData(BTreeQueryBuffer buffer) {
+
+            long dataOffset = findData(buffer.currentValue());
+            if (dataOffset >= 0) {
+                buffer.rejectAndAdvance();
+
+                long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
+                long relOffset = dataOffset - blockBase;
+
+                int numEntries =
+                        min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
+
+                if (buffer.currentValue() <= boundary) {
+                    file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
+                }
+            }
+            else {
+                buffer.retainAndAdvance();
+            }
+        }
     }
 
-    private long relativePositionInIndex(long key, long start, long n) {
-        return indexSearcher.binarySearchUpper(key, start, n) - start;
-    }
 }
key = keyRaw & ctx.equalityMask(); - final long dataAddress = header.dataOffsetLongs(); - - final long searchStart; - final long numEntries; - - if (header.layers() == 0) { // For small data, there is no index block, only a flat data block - searchStart = dataAddress; - numEntries = header.numEntries(); - } - else { - cache.load(); - - long dataLayerOffset = searchIndex(header, cache, key); - if (dataLayerOffset < 0) { - return dataLayerOffset; - } - - searchStart = dataAddress + dataLayerOffset * ctx.entrySize(); - numEntries = min(header.numEntries() - dataLayerOffset, blockSize); - } - - return dataSearcher.binarySearch(key, searchStart, numEntries); - } - - private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) { - final int blockSize = ctx.BLOCK_SIZE_WORDS(); - long layerOffset = 0; - - for (int i = header.layers() - 1; i >= 0; --i) { - final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset; - - final long nextLayerOffset = cache.relativePositionInIndex(key, (int) indexLayerBlockOffset, blockSize); - if (nextLayerOffset < 0) - return nextLayerOffset; - - layerOffset = blockSize * (nextLayerOffset + layerOffset); - } - - return layerOffset; - } - - /** A cache for the BTree index data that will drastically reduce the number of disk reads - * for repeated queries against the same tree. The memory consumption is typically very low - * and the disk access pattern for reading the entire index relatively cheap. - */ - public class BTreeCachedIndex { - long[] indexData; - final BTreeHeader header; - - final int indexedDataSize; - - public BTreeCachedIndex(BTreeHeader header) { - this.header = header; - indexedDataSize = header.numEntries(); - } - - public void load() { - if (indexData != null) - return; - - int size = (int)(header.dataOffsetLongs() - header.indexOffsetLongs()); - indexData = new long[size]; - file.read(indexData, header.indexOffsetLongs()); - } - - long relativePositionInIndex(long key, int fromIndex, int n) { - int low = 0; - int high = n - 1; - - while (low <= high) { - int mid = (low + high) >>> 1; - long midVal = indexData[fromIndex + mid]; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return mid; - } - return low; - } - - public long sizeBytes() { - return isLoaded() ? 
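The deleted cache's relativePositionInIndex is a plain lower-bound search over an in-memory long[]: on a miss it returns low, the insertion point, which doubles as the slot of the child block to descend into. The same convention reappears in MultimapFileLong's binarySearchUpperInternal later in this patch. A standalone sketch of the convention:

static long lowerBound(long[] a, int fromIndex, int n, long key) {
    int low = 0, high = n - 1;
    while (low <= high) {
        int mid = (low + high) >>> 1;
        long midVal = a[fromIndex + mid];
        if (midVal < key) low = mid + 1;
        else if (midVal > key) high = mid - 1;
        else return mid;
    }
    return low; // insertion point; never negative, unlike Arrays.binarySearch
}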
8L*indexData.length : 0; - } - - public int getIndexedDataSize() { - return indexedDataSize; - } - - public boolean isLoaded() { - return indexData != null; - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java index e91b71fd..3179db70 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java @@ -19,7 +19,7 @@ public record BTreeContext(int MAX_LAYERS, } public int numIndexLayers(int numEntries) { - if (numEntries <= BLOCK_SIZE_WORDS*2) { + if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) { return 0; } for (int i = 1; i < MAX_LAYERS; i++) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java index 557ad991..9e89b730 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java @@ -26,7 +26,6 @@ public class DictionaryData { if (rb == -1) { int end = activeBank.getEnd(); - logger.debug("Switching bank @ {}", end); var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE); rb = newBank.add(key); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java index 1c76b116..f66599d3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java @@ -16,7 +16,7 @@ import static nu.marginalia.util.FileSizeUtil.readableSize; * Spiritually influenced by GNU Trove's hash maps * LGPL 2.1 */ -public class DictionaryHashMap { +public class DictionaryHashMap implements DictionaryMap { private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class); private static final Gauge probe_count_metrics = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count") @@ -81,6 +81,7 @@ public class DictionaryHashMap { } } + @Override public int size() { return sz.get(); } @@ -97,6 +98,7 @@ public class DictionaryHashMap { buffers[buffer].put(bufferIdx, val); } + @Override public int put(long key) { long hash = key & 0x7FFF_FFFF_FFFF_FFFFL; @@ -143,6 +145,7 @@ public class DictionaryHashMap { return di; } + @Override public int get(long key) { final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL; final long cell = hash % hashTableSize; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java new file mode 100644 index 00000000..fad45130 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java @@ -0,0 +1,9 @@ +package nu.marginalia.util.dict; + +public interface DictionaryMap { + int size(); + + int put(long key); + + int get(long key); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java index c3653bc9..7649201b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java @@ -9,7 +9,10 @@ import javax.inject.Inject; import javax.inject.Singleton; import java.io.BufferedReader; import 
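Extracting the DictionaryMap interface lets callers swap the off-heap DictionaryHashMap for something simpler. A hypothetical on-heap implementation for tests or small datasets, not part of this patch; the negative absent-value sentinel is an assumption mirroring DictionaryHashMap's miss convention:

import gnu.trove.map.hash.TLongIntHashMap;

public class OnHeapDictionaryMap implements DictionaryMap {
    // -1 stands in for "absent"; the real map's sentinel may differ
    private final TLongIntHashMap map = new TLongIntHashMap(1024, 0.75f, 0, -1);

    @Override
    public int size() { return map.size(); }

    @Override
    public int put(long key) {
        int id = map.get(key);
        if (id < 0) {
            id = map.size();
            map.put(key, id);
        }
        return id; // existing or freshly assigned ordinal
    }

    @Override
    public int get(long key) { return map.get(key); }
}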
java.io.InputStreamReader; -import java.util.*; +import java.util.HashSet; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; @Singleton public class LanguageFilter { @@ -78,7 +81,10 @@ public class LanguageFilter { } public boolean isBlockedUnicodeRange(String data) { - return Arrays.stream(UnicodeRanges.values()) - .parallel().anyMatch(range -> range.test(data)); + for (var range: UnicodeRanges.values()) { + if (range.test(data)) + return true; + } + return false; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java index ef46ee0b..bd1d3043 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java @@ -68,10 +68,19 @@ public enum UnicodeRanges { this.max = max; } - boolean test(String text) { - return text.chars().limit(1000).parallel() - .filter(i -> i >= min && i < max) - .count() >= (sensitive ? 15 : 100); + public boolean test(String text) { + int count = 0; + int max = sensitive ? 15 : 100; + for (int i = 0; i < Math.min(2000, text.length()); i++) { + char c = text.charAt(i); + if (c >= min && c <= this.max) { + if (count++ > max) { + return true; + } + } + } + + return false; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java index d87f304f..4766706e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java @@ -88,6 +88,9 @@ public class WordPatterns { } public static boolean hasWordQualities(String s) { + if (s.isBlank()) + return false; + int start = 0; int end = s.length(); if (s.charAt(0) == '#') start++; @@ -95,13 +98,14 @@ public class WordPatterns { for (int i = start; i < end; i++) { char c = s.charAt(i); - if (!("_@.'+-".indexOf(c) >= 0) + if (("_@.'+-".indexOf(c) < 0) && !(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9') && !(c >= '\u00C0' && c <= '\u00D6') && !(c >= '\u00D8' && c <= '\u00f6') - && !(c >= '\u00f8' && c <= '\u00ff')) { + && !(c >= '\u00f8' && c <= '\u00ff')) + { return false; } } @@ -119,10 +123,14 @@ public class WordPatterns { if (!filter(s)) { return true; } - if (topWords.contains(s.toLowerCase())) { + if (isTopWord(s)) { return true; } return false; } + public static boolean isTopWord(String s) { + return topWords.contains(s.toLowerCase()); + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 58b7c198..978d8b63 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -2,8 +2,10 @@ package nu.marginalia.util.language.processing; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; import 
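The new isBlank() guard is a crash fix as much as a semantic one: hasWordQualities("") previously reached s.charAt(0) and threw StringIndexOutOfBoundsException. A quick check of the tightened predicate, with illustrative inputs (run with -ea):

public class WordPatternsDemo {
    public static void main(String[] args) {
        assert !WordPatterns.hasWordQualities("");        // guarded now, used to throw
        assert WordPatterns.hasWordQualities("#hello");   // leading '#' is skipped
        assert WordPatterns.hasWordQualities("foo_bar");  // '_' is in the allowed set
        assert !WordPatterns.hasWordQualities("foo;bar"); // ';' fails every range test
        assert WordPatterns.isTopWord("The") == WordPatterns.isTopWord("the"); // case-folded
    }
}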
nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; @@ -20,14 +22,9 @@ public class DocumentKeywordExtractor { private final NameCounter nameCounter; private final SubjectCounter subjectCounter; - private final TermFrequencyDict dict; - private final double docCount; @Inject public DocumentKeywordExtractor(TermFrequencyDict dict) { - this.dict = dict; - docCount = dict.docCount(); - keywordExtractor = new KeywordExtractor(); tfIdfCounter = new KeywordCounter(dict, keywordExtractor); @@ -36,69 +33,105 @@ public class DocumentKeywordExtractor { } - public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) { + public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { List titleWords = extractTitleWords(documentLanguageData); - - KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData); List wordsNamesAll = nameCounter.count(documentLanguageData, 2); List subjects = subjectCounter.count(documentLanguageData); - List midKeywords = new ArrayList<>(wordsTfIdf.mid()); - List topKeywords = new ArrayList<>(wordsTfIdf.top()); + tfIdfCounter.countHisto(keywordMetadata, documentLanguageData); - Collection artifacts = getArtifacts(documentLanguageData); + for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed); + for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed); + for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed); + + List artifacts = getArtifacts(documentLanguageData); + + keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple); return new EdgePageWordSet( - createWords(IndexBlock.Subjects, subjects), - createWords(IndexBlock.Title, titleWords), - createWords(IndexBlock.NamesWords, wordsNamesAll), - createWords(IndexBlock.Tfidf_Top, topKeywords), - createWords(IndexBlock.Tfidf_Middle, midKeywords), - new EdgePageWords(IndexBlock.Artifacts, artifacts) + createWords(keywordMetadata, IndexBlock.Title, titleWords), + EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts) ); } - - - public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) { + public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { List titleWords = extractTitleWords(documentLanguageData); - KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData); - List wordsNamesAll = nameCounter.count(documentLanguageData, 1); + getWordPositions(keywordMetadata, documentLanguageData); + + List wordsNamesAll = nameCounter.count(documentLanguageData, 2); List subjects = subjectCounter.count(documentLanguageData); - List lowKeywords = new ArrayList<>(wordsTfIdf.lower()); - List midKeywords = new ArrayList<>(wordsTfIdf.mid()); - List topKeywords = new ArrayList<>(wordsTfIdf.top()); + List wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData); - Collection artifacts = getArtifacts(documentLanguageData); + for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed); + for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed); + for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed); + + List artifacts = getArtifacts(documentLanguageData); var wordSet = new EdgePageWordSet( - createWords(IndexBlock.Subjects, subjects), - 
createWords(IndexBlock.Title, titleWords), - createWords(IndexBlock.NamesWords, wordsNamesAll), - createWords(IndexBlock.Tfidf_Top, topKeywords), - createWords(IndexBlock.Tfidf_Middle, midKeywords), - createWords(IndexBlock.Tfidf_Lower, lowKeywords), - new EdgePageWords(IndexBlock.Artifacts, artifacts) + createWords(keywordMetadata, IndexBlock.Title, titleWords), + createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf), + createWords(keywordMetadata, IndexBlock.Subjects, subjects), + EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts) ); - getSimpleWords(wordSet, documentLanguageData, + getSimpleWords(keywordMetadata, wordSet, documentLanguageData, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); return wordSet; } - private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) { + + public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) { + Map ret = keywordMetadata.positionMask(); + + int posCtr = 0; + for (var sent : dld.titleSentences) { + int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL); + + for (var word : sent) { + ret.merge(word.stemmed(), posBit, this::bitwiseOr); + } + + for (var span : keywordExtractor.getNames(sent)) { + ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); + } + } + posCtr+=4; + for (var sent : dld.sentences) { + int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL); + + for (var word : sent) { + ret.merge(word.stemmed(), posBit, this::bitwiseOr); + } + + for (var span : keywordExtractor.getNames(sent)) { + ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); + } + + posCtr++; + } + } + + private int bitwiseOr(int a, int b) { + return a | b; + } + + + private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... 
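Worked example of the position mask built in getWordPositions above: the title loop runs with posCtr = 0, so every title occurrence sets bit 0; posCtr then jumps to 4, so body sentences 0..3 set bit 1, sentences 4..7 set bit 2, and so on in groups of four. A word seen in the title and again in body sentence 5 therefore ends up with mask (1 << 0) | (1 << 2) = 0b101. Once posCtr/4 reaches 32 the shifted bit falls outside the 32-bit window and is masked to zero, so occurrences past roughly the 124th body sentence contribute no position information.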
blocks) { + + EnumSet flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class); int start = 0; int lengthGoal = 32; - for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) { + for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) { IndexBlock block = blocks[blockIdx]; - Set words = new HashSet<>(lengthGoal+100); + Set words = new HashSet<>(lengthGoal+100); int pos; int length = 0; @@ -110,55 +143,26 @@ public class DocumentKeywordExtractor { if (!word.isStopWord()) { String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); if (WordPatterns.singleWordQualitiesPredicate.test(w)) { - words.add(w); + words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed()))); } } } + + for (var names : keywordExtractor.getNames(sent)) { + var rep = new WordRep(sent, names); + String w = AsciiFlattener.flattenUnicode(rep.word); + + words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed))); + } } wordSet.append(block, words); start = pos; lengthGoal+=32; } - - if (start < documentLanguageData.sentences.length) { - - Map counts = new HashMap<>(documentLanguageData.totalNumWords()); - for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) { - var sent = documentLanguageData.sentences[pos]; - for (var word : sent) { - if (!word.isStopWord()) { - String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); - if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) { - counts.merge(w, 1, Integer::sum); - } - } - } - } - - Set lastSet; - if (counts.size() < 1024) { - lastSet = counts.keySet(); - } - else { - lastSet = counts.entrySet().stream() - .sorted(Comparator.comparing(e -> { - double N = docCount; // Number of documents in term freq dictionary - - // Caveat: This is actually the *negated* term score, because the second logarithm has - // its parameter inverted (log(a^b) = b log(a); here b = -1) - return (1 + Math.log(e.getValue())) * Math.log((1. 
+ dict.getTermFreq(e.getKey())) / N); - })) - .map(Map.Entry::getKey) - .limit(1024) - .collect(Collectors.toCollection(LinkedHashSet::new)); - } - - wordSet.append(blocks[blocks.length - 1], lastSet); - } } private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+"); - private Collection getArtifacts(DocumentLanguageData documentLanguageData) { + private List getArtifacts(DocumentLanguageData documentLanguageData) { Set reps = new HashSet<>(); for (var sent : documentLanguageData.sentences) { @@ -167,6 +171,7 @@ public class DocumentKeywordExtractor { if (lc.length() > 6 && lc.indexOf('@') > 0 && mailLikePattern.matcher(lc).matches()) { + reps.add(lc); String domain = lc.substring(lc.indexOf('@')); @@ -182,7 +187,7 @@ public class DocumentKeywordExtractor { } } } - return reps; + return new ArrayList<>(reps); } private List extractTitleWords(DocumentLanguageData documentLanguageData) { @@ -192,7 +197,21 @@ public class DocumentKeywordExtractor { .collect(Collectors.toList()); } - public EdgePageWords createWords(IndexBlock block, Collection words) { - return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet())); + public EdgePageWords createWords(KeywordMetadata metadata, + IndexBlock block, + Collection words) { + + Set entries = new HashSet<>(words.size()); + for (var word : words) { + + String flatWord = AsciiFlattener.flattenUnicode(word.word); + if (!WordPatterns.hasWordQualities(flatWord)) { + continue; + } + + entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed))); + } + + return new EdgePageWords(block, entries); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index efa57bd2..5bee1a5d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -1,15 +1,19 @@ package nu.marginalia.util.language.processing; +import com.github.jknack.handlebars.internal.lang3.StringUtils; +import gnu.trove.map.hash.TObjectIntHashMap; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; +import java.util.List; + +import static java.lang.Math.max; public class KeywordCounter { private final KeywordExtractor keywordExtractor; @@ -19,72 +23,78 @@ public class KeywordCounter { public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) { this.dict = dict; this.keywordExtractor = keywordExtractor; - this.docCount = (double) dict.docCount(); + this.docCount = dict.docCount(); } - public WordHistogram countHisto(DocumentLanguageData dld) { - HashMap counts = new HashMap<>(15000); + public List countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) { + TObjectIntHashMap counts = new TObjectIntHashMap<>(10_000, 0.7f); HashMap> instances = new HashMap<>(15000); for (var sent 
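What getArtifacts collects, under the mail-like pattern above; the domain-variant expansion that follows reps.add(lc) is not fully visible in this hunk, so only the base match is illustrated (inputs are examples, not from the patch):

import java.util.regex.Pattern;

public class MailPatternDemo {
    public static void main(String[] args) {
        Pattern mailLike = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
        assert mailLike.matcher("ada.lovelace@example.com").matches(); // kept as an artifact
        assert !mailLike.matcher("user@").matches();                   // no domain part
        assert !mailLike.matcher("@example.com").matches();            // no local part
        assert !mailLike.matcher("user@localhost").matches();          // needs a dotted suffix
    }
}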
: dld.sentences) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { - if (span.size() == 1 && - WordPatterns.isStopWord(sent.words[span.start])) + + if (span.size() == 1 && WordPatterns.isStopWord(sent.words[span.start])) { continue; + } - String stemmed = sent.constructStemmedWordFromSpan(span); + var rep = new WordRep(sent, span); - counts.merge(stemmed, 1, Integer::sum); - instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span)); + counts.adjustOrPutValue(rep.stemmed, 1, 1); + var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500)); + if (instanceSet.size() < 250) { + instanceSet.add(rep); + } } } - double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1); + HashMap tfIdf = keywordMetadata.wordsTfIdf(); + List tfIdfHigh = new ArrayList<>(); - Set h5 = new HashSet<>(2500); - Set h10 = new HashSet<>(500); - Set h15 = new HashSet<>(500); + int maxVal = maxValue(counts); - int doubleWordCount = 0; + counts.forEachEntry((key, cnt) -> { + int value = getTermValue(key, cnt, maxVal); - for (var entry : counts.entrySet()) { - double value = getTermValue(entry, maxC); + tfIdf.put(key, new WordFrequencyData(cnt, value)); - double avgCnt = entry.getValue(); - String wordStemmed = entry.getKey(); + if (cnt > 1 && value > 100) { + tfIdfHigh.addAll(instances.get(key)); + } - Set histogram; - if (value < -3 && avgCnt>1) histogram = h15; - else if (value < -1.75 && avgCnt>1) histogram = h10; - else if (value < -1 && - (!wordStemmed.contains("_") || doubleWordCount++ < 50)) - histogram = h5; - else continue; + return true; + }); - histogram.addAll(instances.get(wordStemmed)); - } - return new WordHistogram(h5, h10, h15); + return tfIdfHigh; } - private static final Pattern separator = Pattern.compile("_"); + private int maxValue(TObjectIntHashMap map) { + int maxC = 0; + for (int c : map.values()) { + maxC = max(c, maxC); + } + return maxC; + } - public double getTermValue(Map.Entry e, double maxValue) { - String key = e.getKey(); - if (key.contains("_")) { - String[] parts = separator.split(e.getKey()); + public int getTermValue(String key, int count, double maxValue) { + if (key.indexOf('_') >= 0) { + String[] parts = StringUtils.split(key, '_'); double totalValue = 0.; for (String part : parts) { - totalValue += value(part, e.getValue(), maxValue); + totalValue += value(part, count, maxValue); } - return totalValue / parts.length; + return normalizeValue(totalValue / parts.length); } else { - return value(key, e.getValue(), maxValue); + return normalizeValue(value(key, count, maxValue)); } } + int normalizeValue(double v) { + return (int)(-v*75); + } + double value(String key, double value, double maxValue) { double freq = dict.getTermFreqStemmed(key); if (freq < 1) { @@ -93,5 +103,5 @@ public class KeywordCounter { return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount); } - public record WordHistogram(Set lower, Set mid, Set top) { } + public record WordFrequencyData(int count, int tfIdfNormalized) { } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java deleted file mode 100644 index e2dfd3ad..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java +++ /dev/null @@ -1,64 +0,0 @@ -package nu.marginalia.util.language.processing; - -import 
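Worked example of the scoring above, with assumed numbers (docCount and term frequencies come from TermFrequencyDict, not this hunk). For a single-part word with count = 4, maxValue = 10, and dictionary frequency 1_000 against docCount = 10_000_000:

value = (0.1 + 0.9 * 4/10) * ln(1_000 / 10_000_000) ≈ 0.46 * -9.21 ≈ -4.24
normalizeValue(-4.24) = (int)(4.24 * 75) = 317

Because count > 1 and 317 > 100, the word's instances are promoted into tfIdfHigh; multi-part words (those containing '_') average the per-part values before normalizing.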
nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.DocumentSentence; -import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; - -import java.util.*; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -public class LongNameCounter { - private final KeywordExtractor keywordExtractor; - private final TermFrequencyDict dict; - private final double docCount; - public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) { - this.dict = dict; - docCount = (double) dict.docCount(); - this.keywordExtractor = keywordExtractor; - } - - public List count(DocumentLanguageData dld) { - HashMap counts = new HashMap<>(1000); - HashMap> instances = new HashMap<>(1000); - - for (int i = 0; i < dld.sentences.length; i++) { - DocumentSentence sent = dld.sentences[i]; - var keywords = keywordExtractor.getNamesStrict(sent); - for (var span : keywords) { - var stemmed = sent.constructStemmedWordFromSpan(span); - counts.merge(stemmed, 1., Double::sum); - instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span)); - } - } - - return counts.entrySet().stream().filter(e -> termSize(e.getKey()) > 1) - .sorted(Comparator.comparing(this::getTermValue)) - .limit(Math.min(50, counts.size()/3)) - .map(Map.Entry::getKey) - .flatMap(w -> instances.get(w).stream()).collect(Collectors.toList()); - } - - int termSize(String word) { - return 1 + (int) word.chars().filter(c -> c == '_').count(); - } - - - final Pattern separator = Pattern.compile("_"); - - public double getTermValue(Map.Entry e) { - String[] parts = separator.split(e.getKey()); - double totalValue = 0.; - for (String part : parts) { - totalValue += value(part, e.getValue()); - } - return totalValue / Math.sqrt(parts.length); - } - - double value(String key, double value) { - return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.); - } - - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java index 142f1477..476b7b5d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java @@ -37,7 +37,8 @@ public class NameCounter { .sorted(Comparator.comparing(e -> -e.getValue())) .limit(150) .map(Map.Entry::getKey) - .flatMap(w -> instances.get(w).stream()).collect(Collectors.toList()); + .flatMap(w -> instances.get(w).stream()) + .collect(Collectors.toList()); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java index 87283f71..ea071bf3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java @@ -1,6 +1,7 @@ package nu.marginalia.util.language.processing; import com.github.datquocnguyen.RDRPOSTagger; +import com.github.jknack.handlebars.internal.lang3.StringUtils; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TObjectIntHashMap; import lombok.AllArgsConstructor; @@ -125,11 +126,45 @@ public class SentenceExtractor { return counts; } - private static final Pattern dotPattern = 
Pattern.compile("\\.+$"); private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)"); - private static final Pattern spacesPattern = Pattern.compile("\\s+"); - private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))"); +// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))"); + + private boolean isBadChar(char c) { + if (c >= 'a' && c <= 'z') return false; + if (c >= 'A' && c <= 'Z') return false; + if (c >= '0' && c <= '9') return false; + if ("_#@.".indexOf(c) >= 0) return false; + if (c >= '\u00C0' && c <= '\u00D6') return false; + if (c >= '\u00D8' && c <= '\u00F6') return false; + if (c >= '\u00F8' && c <= '\u00FF') return false; + + return true; + } + private String sanitizeString(String s) { + char[] newChars = new char[s.length()]; + int pi = 0; + + for (int i = 0; i < newChars.length; i++) { + char c = s.charAt(i); + if (!isBadChar(c)) { + newChars[pi++] = c; + } + else { + newChars[pi++] = ' '; + } + } + + s = new String(newChars, 0, pi); + + if (s.startsWith(".")) { + s = s.substring(1); + if (s.isBlank()) + return ""; + } + return s; + + } public DocumentSentence extractSentence(String text) { var wordsAndSeps = splitSegment(text); @@ -139,7 +174,7 @@ public class SentenceExtractor { var lc = toLc(wordsAndSeps.words); return new DocumentSentence( - badCharPattern.matcher(text).replaceAll(" "), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc) + sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc) ); } @@ -161,7 +196,7 @@ public class SentenceExtractor { sentences = sentenceDetector.sentDetect(textNormalizedSpaces); } catch (Exception ex) { - sentences = textNormalizedSpaces.split("[.]"); + sentences = StringUtils.split(textNormalizedSpaces, '.'); } if (sentences.length > 250) { @@ -196,8 +231,8 @@ public class SentenceExtractor { separators[i] = Arrays.copyOf(separators[i], 250); } for (int j = 0; j < tokens[i].length; j++) { - if (tokens[i][j].endsWith(".")) { - tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll(""); + while (tokens[i][j].endsWith(".")) { + tokens[i][j] = StringUtils.removeEnd(tokens[i][j], "."); } } } @@ -216,7 +251,7 @@ public class SentenceExtractor { DocumentSentence[] ret = new DocumentSentence[sentences.length]; for (int i = 0; i < ret.length; i++) { - ret[i] = new DocumentSentence(badCharPattern.matcher(sentences[i]).replaceAll(" "), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]); + ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]); } return ret; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java index 80ff77f5..af774898 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java @@ -5,9 +5,7 @@ import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; 
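The hand-rolled sanitizeString above replaces the commented-out badCharPattern regex, with one behavioral difference worth noting: the regex collapsed each maximal run of bad characters into a single space (the trailing + in the character-class group), while the loop emits one space per bad character. The \.(\s+|$) half of the old pattern is covered separately, by the leading-dot trim here and by the trailing-dot stripping in the tokenizer loop below. Illustrative: sanitizeString("foo;bar") yields "foo bar", sanitizeString(";;foo") now yields two leading spaces where the regex produced one, and sanitizeString(".fragment") yields "fragment".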
public class SubjectCounter { @@ -27,7 +25,9 @@ public class SubjectCounter { public List count(DocumentLanguageData dld) { - Map counts = new HashMap<>(); + Map counts = new HashMap<>(); + Map> instances = new HashMap<>(); + for (var sentence : dld.sentences) { for (WordSpan kw : keywordExtractor.getNames(sentence)) { if (kw.end + 2 >= sentence.length()) { @@ -41,7 +41,13 @@ public class SubjectCounter { String nextNextTag = sentence.posTags[kw.end+1]; if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) { - counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum); + var span = new WordSpan(kw.start, kw.end); + var rep = new WordRep(sentence, span); + + String stemmed = rep.stemmed; + + counts.merge(stemmed, -1, Integer::sum); + instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep); } } } @@ -49,8 +55,8 @@ public class SubjectCounter { int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0); return counts.entrySet().stream().sorted(Map.Entry.comparingByValue()) - .filter(e -> e.getValue()<-2 && e.getValue() e.getValue()<-2 && e.getValue()<=best*0.75) + .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream()) .collect(Collectors.toList()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java new file mode 100644 index 00000000..a8b093f6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java @@ -0,0 +1,58 @@ +package nu.marginalia.util.language.processing.model; + +import nu.marginalia.util.language.processing.KeywordCounter; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; + +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; + +public record KeywordMetadata(HashSet titleKeywords, + HashSet subjectKeywords, + HashSet namesKeywords, + HashMap wordsTfIdf, + HashMap positionMask, + EnumSet flagsTemplate, + int quality +) +{ + + private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0); + + public KeywordMetadata(double quality, EnumSet flags) { + this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50), + new HashMap<>(15_000), + new HashMap<>(10_000), + flags, + (int)(-quality)); + } + + public KeywordMetadata(double quality) { + this(quality, EnumSet.noneOf(EdgePageWordFlags.class)); + } + + public long forWord(EnumSet flagsTemplate, String stemmed) { + + KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty); + EnumSet flags = flagsTemplate.clone(); + + if (subjectKeywords.contains(stemmed)) + flags.add(EdgePageWordFlags.Subjects); + + if (namesKeywords.contains(stemmed)) + flags.add(EdgePageWordFlags.NamesWords); + + if (titleKeywords.contains(stemmed)) + flags.add(EdgePageWordFlags.Title); + + int positions = positionMask.getOrDefault(stemmed, 0); + + return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode(); + } + + public int quality() { + return -quality; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java index 764d2fac..c59e13f8 100644 --- 
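Usage sketch for the KeywordMetadata record above; the bit layout is EdgePageWordMetadata's concern and is not shown in this hunk, so only the field values passed to encode() are spelled out:

KeywordMetadata meta = new KeywordMetadata(-12.5); // quality is stored negated
meta.titleKeywords().add("marginalia");
meta.subjectKeywords().add("marginalia");

long encoded = meta.forWord(meta.flagsTemplate(), "marginalia");
// equivalent to: new EdgePageWordMetadata(0 /*tfidf*/, 0 /*positions*/,
//                12 /*quality*/, 0 /*count*/, {Title, Subjects}).encode()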
a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java @@ -1,21 +1,22 @@ package nu.marginalia.util.language.processing.model; import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; import lombok.Getter; import org.jetbrains.annotations.NotNull; import java.util.Objects; -@AllArgsConstructor @EqualsAndHashCode @Getter +@AllArgsConstructor @Getter public class WordRep implements Comparable { public WordRep(DocumentSentence sent, WordSpan span) { word = sent.constructWordFromSpan(span); stemmed = sent.constructStemmedWordFromSpan(span); length = span.end - span.start; + hashCode = Objects.hash(word); } + public final int length; public final String word; public final String stemmed; @@ -34,4 +35,12 @@ public class WordRep implements Comparable { public int hashCode() { return hashCode; } + + public boolean equals(Object other) { + if (other == this) return true; + if (other instanceof WordRep wr) { + return Objects.equals(wr.word, word); + } + return false; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index 6423d18a..d3e1376d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -2,6 +2,7 @@ package nu.marginalia.util.multimap; import com.upserve.uppend.blobs.NativeIO; import lombok.SneakyThrows; +import nu.marginalia.util.btree.BTreeQueryBuffer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -100,15 +101,15 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { public MultimapSearcherBase createSearcher() { return new MultimapSearcherBase(this); } - public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) { - return new MultimapSorter(this, tmpFile, internalSortLimit); + public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) { + return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize); } @SneakyThrows public void advice(NativeIO.Advice advice) { this.defaultAdvice = advice; for (var buffer : mappedByteBuffers) { - NativeIO.madvise(buffer, advice); + NativeIO.madvise(buffer, advice); } } @@ -215,6 +216,9 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { @Override public long get(long idx) { + if (idx < 0) + throw new IllegalArgumentException("get("+idx+")"); + if (idx >= mappedSize) grow(idx); @@ -340,6 +344,49 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } + + @Override + public void write(LongBuffer vals, int n, long idx) { + if (idx+n >= mappedSize) { + grow(idx+n); + } + int iN = (int)((idx + n) / bufferSize); + + for (int i = 0; i < n; ) { + int i0 = (int)((idx + i) / bufferSize); + + int bufferOffset = (int) ((idx+i) % bufferSize); + var buffer = buffers.get(i0); + + final int l; + + if (i0 < iN) l = bufferSize - bufferOffset; + else l = Math.min(n - i, bufferSize - bufferOffset); + + buffer.put(bufferOffset, vals, vals.position() + i, l); + i+=l; + } + + } + + @Override + public void swapn(int n, long idx1, long idx2) { + for (int i = 0; i < n; i++) + swap(idx1+i, idx2+i); + } + + private void swap(long idx1, long idx2) { + LongBuffer buff1 = buffers.get((int)(idx1) / bufferSize); + final int o1 = (int) (idx1) % bufferSize; 
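Note on the WordRep change above: equality and the cached hash are now derived from the surface word alone, where the old Lombok @EqualsAndHashCode compared every field. Two WordReps built from different sentences or spans therefore compare equal whenever they render the same word, which is what lets the HashSet<WordRep> instance sets in KeywordCounter and SubjectCounter deduplicate repeated mentions instead of accumulating one entry per occurrence.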
+ + LongBuffer buff2 = buffers.get((int)(idx2) / bufferSize); + final int o2 = (int) (idx2) % bufferSize; + + long tmp = buff1.get(o1); + buff1.put(o1, buff2.get(o2)); + buff2.put(o2, tmp); + } + @Override public void setRange(long idx, int n, long val) { if (n == 0) return; @@ -410,6 +457,383 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { } + @Override + public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) { + if (fromIndex + n*step >= mappedSize) + grow(fromIndex + n*step); + + long low = 0; + long high = n - 1; + + if (fromIndex/bufferSize == (fromIndex+step*n)/bufferSize) { + int idx = (int)(fromIndex / bufferSize); + + while (low <= high) { + long mid = (low + high) >>> 1; + long off = fromIndex + mid*step; + long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid*step; + } + } + else { + while (low <= high) { + long mid = (low + high) >>> 1; + long off = fromIndex + mid*step; + long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid*step; + } + } + + return -1L-(fromIndex + high*step); + } + + @Override + public long binarySearchInternal(long key, long fromIndex, long n, long mask) { + if (fromIndex + n >= mappedSize) + grow(fromIndex + n); + + long low = 0; + long high = n - 1; + + if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) { + int idx = (int)(fromIndex / bufferSize); + + while (low <= high) { + long mid = (low + high) >>> 1; + long off = fromIndex + mid; + long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + } + else { + while (low <= high) { + long mid = (low + high) >>> 1; + long off = fromIndex + mid; + long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask; + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + } + + return -1L-(fromIndex + high); + } + + + + @Override + public long binarySearchInternal(long key, long fromIndex, long n) { + if (fromIndex + n >= mappedSize) + grow(fromIndex + n); + + long low = 0; + long high = n - 1; + + if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) { + int idx = (int)(fromIndex / bufferSize); + + while (low <= high) { + long mid = (low + high) >>> 1; + long off = fromIndex + mid; + long midVal = buffers.get(idx).get((int)(off % bufferSize)); + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + } + else { + while (low <= high) { + long mid = (low + high) >>> 1; + long off = fromIndex + mid; + long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)); + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + } + + return -1L-(fromIndex + high); + } + + + @Override + public long binarySearchUpperInternal(long key, long fromIndex, long n) { + if (fromIndex + n >= mappedSize) + grow(fromIndex + n); + + long low = 0; + long high = n - 1; + + if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) { + int idx = (int)(fromIndex / bufferSize); + + while (low <= high) { + long mid = (low + high) >>> 1; + long off = 
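Return convention shared by the binarySearchInternal variants above: a hit returns the absolute position of the match, a miss returns -1L - (position of the last entry below the key). A caller can recover both facts from the single long; the call site below is illustrative:

long result = file.binarySearchInternal(key, fromIndex, n);
if (result >= 0) {
    // found: result is the file position of the matching entry
} else {
    long lastBelow = -1 - result; // position of the greatest entry < key,
                                  // or fromIndex - 1 if everything is above the key
}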
fromIndex + mid; + long midVal = buffers.get(idx).get((int)(off % bufferSize)); + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + } + else { + while (low <= high) { + long mid = (low + high) >>> 1; + long off = fromIndex + mid; + long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)); + + if (midVal < key) + low = mid + 1; + else if (midVal > key) + high = mid - 1; + else + return fromIndex + mid; + } + } + + return fromIndex + low; + } + + private boolean isSameBuffer(long a, long b) { + return a / bufferSize == b/bufferSize; + } + + @Override + public long quickSortPartition(int wordSize, long low, long high) { + if (high >= mappedSize) + grow(high + wordSize - 1); + + if (isSameBuffer(low, high + wordSize - 1)) { + // Specialization that circumvents the need for expensive calls to + // MultimapFileLong.get() in the most common scenario + + return quickSortPartitionSameBuffer(wordSize, low, high); + } + else { + return quickSortPartitionDifferentBuffers(wordSize, low, high); + } + } + + @Override + public void insertionSort(int wordSize, long start, int n) { + if (start + n + wordSize - 1 >= mappedSize) + grow(start + n + wordSize - 1); + + if (n <= 1) { + return; + } + + if (isSameBuffer(start, start + (long)n*wordSize-1L)) { + final var buffer = buffers.get((int) (start / bufferSize)); + int off = (int) (start % bufferSize); + + for (int i = 1; i < n; i++) { + long key = buffer.get(off + i * wordSize); + + int j = i - 1; + while (j >= 0 && buffer.get(off + wordSize*j) > key) { + for (int w = 0; w < wordSize; w++) { + long tmp = buffer.get(off+wordSize*j+w); + buffer.put(off+wordSize*j+w, buffer.get(off+wordSize*(j+1)+w)); + buffer.put(off+wordSize*(j+1)+w, tmp); + } + j--; + } + buffer.put(off + (j+1) * wordSize, key); + } + } + else for (int i = 1; i < n; i++) { + long key = get(start + (long) i * wordSize); + + int j = i - 1; + while (j >= 0 && get(start + (long)wordSize*j) > key) { + swapn(wordSize, start + (long)wordSize*j, start + (long)wordSize*(j+1)); + j--; + } + put(start + (long) (j+1) * wordSize, key); + } + } + + + private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) { + + long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize; + long pivot = get(pivotPoint); + + long i = low - wordSize; + long j = high + wordSize; + + for (;;) { + do { + i+=wordSize; + } while (get(i) < pivot); + + do { + j-=wordSize; + } + while (get(j) > pivot); + + if (i >= j) return j; + else swapn(wordSize, i, j); + } + } + + private long quickSortPartitionSameBuffer(int wordSize, long low, long high) { + + final var buffer = buffers.get((int) (low / bufferSize)); + + int pivotPoint = (int) ((low + high) / (2L*wordSize)) * wordSize % bufferSize; + long pivot = buffer.get(pivotPoint); + + int j = (int) (high) % bufferSize + wordSize; + int i = (int) (low) % bufferSize - wordSize; + + long j0 = high + wordSize - j; + + for (;;) { + do { + i+=wordSize; + } while (buffer.get(i) < pivot); + + do { + j-=wordSize; + } + while (buffer.get(j) > pivot); + + if (i >= j) return j0 + j; + else { + for (int w = 0; w < wordSize; w++) { + long tmp = buffer.get(i+w); + buffer.put(i+w, buffer.get(j+w)); + buffer.put(j+w, tmp); + } + } + } + } + + + + public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) { + + final long end = searchStart + stepSize * numEntries; + if (end < mappedSize) { + grow(end); + } + + long bv = 
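The sorts above treat the file as fixed-size records of wordSize longs, ordered by each record's first long; swaps always move whole records. A plain-array sketch of the same Hoare partition scheme (in-memory, illustrative):

static int partition(long[] a, int wordSize, int low, int high) {
    // pivot is the first long of a record-aligned midpoint, as above
    long pivot = a[((low + high) / (2 * wordSize)) * wordSize];
    int i = low - wordSize, j = high + wordSize;
    for (;;) {
        do { i += wordSize; } while (a[i] < pivot);
        do { j -= wordSize; } while (a[j] > pivot);
        if (i >= j) return j;
        for (int w = 0; w < wordSize; w++) { // swap whole records
            long tmp = a[i + w]; a[i + w] = a[j + w]; a[j + w] = tmp;
        }
    }
}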
buffer.currentValue() & mask; + long av = get(searchStart) & mask; + long pos = searchStart; + + int bi = (int)(searchStart / bufferSize); + int bo = (int)(searchStart % bufferSize); + + LongBuffer data = buffers.get(bi); + + while (bv <= boundary && buffer.hasMore()) { + if (bv < av) { + if (!buffer.rejectAndAdvance()) break; + bv = buffer.currentValue() & mask; + continue; + } + else if (bv == av) { + if (!buffer.retainAndAdvance()) break; + bv = buffer.currentValue() & mask; + continue; + } + + pos += stepSize; + if (pos < end) { + bo += stepSize; + if (bo >= bufferSize) { + data = buffers.get(++bi); + bo = 0; + } + av = data.get(bo) & mask; + } + else { + break; + } + } + + } + + public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) { + + final long end = searchStart + stepSize * numEntries; + if (end < mappedSize) { + grow(end); + } + + long bv = buffer.currentValue() & mask; + long av = get(searchStart) & mask; + long pos = searchStart; + + int bi = (int)(searchStart / bufferSize); + int bo = (int)(searchStart % bufferSize); + + LongBuffer data = buffers.get(bi); + + while (bv <= boundary && buffer.hasMore()) { + if (bv < av) { + if (!buffer.retainAndAdvance()) break; + bv = buffer.currentValue() & mask; + continue; + } + else if (bv == av) { + if (!buffer.rejectAndAdvance()) break; + bv = buffer.currentValue() & mask; + continue; + } + + pos += stepSize; + if (pos < end) { + bo += stepSize; + if (bo >= bufferSize) { + data = buffers.get(++bi); + bo = 0; + } + av = data.get(bo) & mask; + } + else { + break; + } + } + + } @Override public void close() throws IOException { @@ -424,6 +848,4 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { System.runFinalization(); System.gc(); } - - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java index f379d1c6..d7724d79 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java @@ -61,6 +61,17 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { map.write(vals, idx+off); } + @Override + public void write(LongBuffer vals, int n, long idx) { + map.write(vals, n,idx+off); + } + + @Override + public void swapn(int n, long idx1, long idx2) { + map.swapn(n, idx1+off, idx2+off); + } + + @Override public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { @@ -75,4 +86,35 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { return new MultimapFileLongOffsetSlice(map, this.off + off); } + + @Override + public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) { + throw new UnsupportedOperationException(); + } + + @Override + public long binarySearchInternal(long key, long fromIndex, long n, long mask) { + throw new UnsupportedOperationException(); + } + + @Override + public long binarySearchInternal(long key, long fromIndex, long n) { + throw new UnsupportedOperationException(); + } + + @Override + public long binarySearchUpperInternal(long key, long fromIndex, long n) { + throw new UnsupportedOperationException(); + + } + + @Override + public long quickSortPartition(int wordSize, long low, long highInclusive) { + return 
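retain and reject above are linear merges between the sorted query buffer and a sorted region of the file: retain keeps buffer values that are present in the region, reject drops them, and both stop at the boundary key so later filter steps can handle the remainder. An in-memory sketch of the retain case, assuming both inputs ascend:

static void retain(BTreeQueryBuffer buffer, long[] range, long boundary) {
    int i = 0;
    while (buffer.hasMore() && buffer.currentValue() <= boundary && i < range.length) {
        long bv = buffer.currentValue();
        if (bv < range[i])       buffer.rejectAndAdvance();  // absent from the range
        else if (bv == range[i]) buffer.retainAndAdvance();  // present: keep it
        else                     i++;                        // catch the range up
    }
}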
map.quickSortPartition(wordSize, low+off, highInclusive+off); + } + + @Override + public void insertionSort(int wordSize, long start, int n) { + map.insertionSort(wordSize, start+off, n); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java index 29f9994d..14f43169 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java @@ -25,9 +25,23 @@ public interface MultimapFileLongSlice { void write(LongBuffer vals, long idx); + void write(LongBuffer vals, int n, long idx); + + void swapn(int n, long idx1, long idx2); + void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException; default MultimapFileLongSlice atOffset(long off) { return new MultimapFileLongOffsetSlice(this, off); } + long binarySearchInternal(long key, long fromIndex, int step, long n, long mask); + long binarySearchInternal(long key, long fromIndex, long n, long mask); + + long binarySearchInternal(long key, long fromIndex, long n); + + long binarySearchUpperInternal(long key, long fromIndex, long n); + + long quickSortPartition(int wordSize, long low, long highInclusive); + + void insertionSort(int wordSize, long start, int n); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java index dd339e40..cc7d5a13 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java @@ -1,7 +1,7 @@ package nu.marginalia.util.multimap; public interface MultimapSearcher { - long binarySearchUpper(long key, long fromIndex, long n); + long binarySearchLower(long key, long fromIndex, long n); long binarySearch(long key, long fromIndex, long n); static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) { @@ -25,8 +25,8 @@ class SimpleMultimapSearcher implements MultimapSearcher { } @Override - public long binarySearchUpper(long key, long fromIndex, long n) { - return base.binarySearchUpper(key, fromIndex, n); + public long binarySearchLower(long key, long fromIndex, long n) { + return base.binarySearchLower(key, fromIndex, n); } @Override @@ -46,8 +46,8 @@ class MaskedMultimapSearcher implements MultimapSearcher { } @Override - public long binarySearchUpper(long key, long fromIndex, long n) { - return base.binarySearchUpper(key, fromIndex, n, mask); + public long binarySearchLower(long key, long fromIndex, long n) { + return base.binarySearchLower(key, fromIndex, n, mask); } @Override @@ -69,8 +69,8 @@ class SteppingMaskedMultimapSearcher implements MultimapSearcher { } @Override - public long binarySearchUpper(long key, long fromIndex, long n) { - return base.binarySearchUpper(key, fromIndex, step, n, mask); + public long binarySearchLower(long key, long fromIndex, long n) { + return base.binarySearchLower(key, fromIndex, step, n, mask); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java index 30549a8c..ed1665df 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java +++ 
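Usage sketch for the renamed searcher API; the mask and step arguments come from the BTreeContext at the call sites above, and the file handle is assumed to be opened elsewhere:

MultimapSearcher searcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
long hit = searcher.binarySearch(key, start, n);      // match position, or negative miss encoding
long lb  = searcher.binarySearchLower(key, start, n); // insertion point, never negative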
b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java @@ -29,26 +29,12 @@ public class MultimapSearcherBase { return false; } - public long binarySearchUpper(long key, long fromIndex, long n) { - long low = 0; - long high = n - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - return fromIndex + low; + public long binarySearchLower(long key, long fromIndex, long n) { + return mmf.binarySearchUpperInternal(key, fromIndex, n); } - public long binarySearchUpper(long key, long fromIndex, long n, long mask) { + public long binarySearchLower(long key, long fromIndex, long n, long mask) { long low = 0; long high = n - 1; @@ -67,7 +53,7 @@ public class MultimapSearcherBase { } - public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) { + public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) { long low = 0; long high = n - 1; @@ -82,62 +68,19 @@ public class MultimapSearcherBase { else return fromIndex + mid*step; } - return fromIndex + low; + return fromIndex + low*step; } public long binarySearch(long key, long fromIndex, long n) { - long low = 0; - long high = n - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - return -1; + return mmf.binarySearchInternal(key, fromIndex, n); } public long binarySearch(long key, long fromIndex, long n, long mask) { - long low = 0; - long high = n - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - return -1; + return mmf.binarySearchInternal(key, fromIndex, n, mask); } - public long binarySearch(long key, long fromIndex, int step, long n, long mask) { - long low = 0; - long high = n - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid*step) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid*step; - } - return -1; + return mmf.binarySearchInternal(key, fromIndex, step, n, mask); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java index 61dd04c4..88d873a3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java @@ -1,87 +1,147 @@ package nu.marginalia.util.multimap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.IOException; import java.io.RandomAccessFile; +import java.nio.ByteBuffer; import java.nio.LongBuffer; import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE; public class MultimapSorter { private final Path tmpFileDir; - private final int internalSortLimit; private final MultimapFileLongSlice multimapFileLong; - private final long[] buffer; + private final LongBuffer buffer; + private final int internalSortLimit; + private final int wordSize; - public 
MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) { + private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class); + + public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) { this.multimapFileLong = multimapFileLong; this.tmpFileDir = tmpFileDir; this.internalSortLimit = internalSortLimit; - buffer = new long[internalSortLimit]; + this.wordSize = wordSize; + buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer(); } - public void sort(long start, int length) throws IOException { - if (length <= internalSortLimit) { - multimapFileLong.read(buffer, length, start); - Arrays.sort(buffer, 0, length); - multimapFileLong.write(buffer, length, start); + public void sortRange(long start, long end) throws IOException { + if (end - start < internalSortLimit) { + quickSortLH(start, end - wordSize); } else { - externalSort(start, length); + mergeSort(start, (int) (end - start)); + } + + for (long lp = start + wordSize; lp < end; lp += wordSize) { + if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) { + + logger.error("Sort contract breached [{}:{} ({}), ws={}, = end || bufferI < bufferJ)) { - workBuffer.put(k, bufferI); - i++; + if (idxL < right && (idxR >= end || bufferL < bufferR)) { + workBuffer.put(putPos, bufferL); + for (int s = 1; s < wordSize; s++) { + workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s)); + } + idxL+= wordSize; } else { - workBuffer.put(k, bufferJ); - j++; + workBuffer.put(putPos, bufferR); + for (int s = 1; s < wordSize; s++) { + workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s)); + } + idxR+= wordSize; + } + } + } + + public void insertionSort(long start, int n) { + multimapFileLong.insertionSort(wordSize, start, n); + } + + private void swap(long a, long b) { + multimapFileLong.swapn(wordSize, a, b); + } + + public void quickSort(long start, long length) { + quickSortLH(start, start + length - wordSize); + + } + public void quickSortLH(long low, long highInclusive) { + + if (low >= 0 && highInclusive >= 0 && low < highInclusive) { + + if (highInclusive - low < 32) { + multimapFileLong.insertionSort(wordSize, low, (int) ((wordSize + highInclusive - low) / wordSize)); + } + else { + long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive); + + quickSortLH(low, p); + quickSortLH(p + wordSize, highInclusive); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index f46fb390..05159ba9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -11,27 +11,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; -import java.util.HashSet; -import java.util.Set; import java.util.concurrent.LinkedBlockingQueue; public class UpdateDomainRanksTool2 { private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class); - public Set originDomains = new HashSet<>(); - public Set originDomainIds = new HashSet<>(); public final long domainIdMax = -1; public int domainCount; private volatile static int rankMax; - public int maxId() { - return (int) domainIdMax; - } - public int domainCount() { - return domainCount; - } - static final 
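Usage sketch for the widened sorter API above: with the new word-size parameter set to 2, the sorter treats the region as (key, value) pairs of longs ordered by key. The MultimapFileLong is assumed to be opened elsewhere, the path and sizes are illustrative, and sortRange may throw IOException when it spills to the external merge path:

MultimapSorter sorter = mmf.createSorter(Path.of("/tmp/sort.tmp"), 1 << 16, 2);
sorter.sortRange(0, 1024); // sorts 512 two-long records in place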
LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10); volatile static boolean running = true; @@ -44,23 +33,14 @@ public class UpdateDomainRanksTool2 { var uploader = new Thread(() -> uploadThread(conn), "Uploader"); logger.info("Ranking"); - // "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com", - // "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net" var ds = new DatabaseModule().provideConnection(); var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); -// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu"); -// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); var rankVector = rpr.pageRankVector(); - var norm = rankVector.norm(); rankMax = rpr.size(); uploader.start(); - - rankMax = rpr.size(); - - rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { try { uploadQueue.put(i); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java b/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java new file mode 100644 index 00000000..122587e6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java @@ -0,0 +1,298 @@ +package nu.marginalia.util.tool; + +import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.map.hash.TIntIntHashMap; +import gnu.trove.map.hash.TIntObjectHashMap; +import gnu.trove.set.hash.TIntHashSet; +import lombok.SneakyThrows; +import nu.marginalia.util.AndCardIntSet; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.id.EdgeId; +import org.roaringbitmap.RoaringBitmap; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.*; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; + +import static nu.marginalia.util.AndCardIntSet.*; + +public class EdgeDomainLinkConsineSimilarityMain { + ArrayList<Integer> idsList = new ArrayList<>(100_000); + ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000); + TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000); + TIntIntHashMap aliasMap = new TIntIntHashMap(100_000, 0.75f, -1, -1); + TIntHashSet indexed = new TIntHashSet(100_000); + + float weights[]; + + private HikariDataSource dataSource; + + public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException { + this.dataSource = dataSource; + + Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000); + try ( + var conn = dataSource.getConnection(); + var aliasStmt = conn.prepareStatement("SELECT ID, DOMAIN_ALIAS FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NOT NULL"); + var indexedStmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE INDEXED>0"); + var linksStmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { + ResultSet rsp; + + aliasStmt.setFetchSize(10_000); + rsp = aliasStmt.executeQuery(); + while (rsp.next()) { + aliasMap.put(rsp.getInt(1), rsp.getInt(2)); + } + + indexedStmt.setFetchSize(10_000); + rsp = indexedStmt.executeQuery(); + while (rsp.next()) { +
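+                // Remember which domains have INDEXED>0; tryDomains() below marks similarity hits against indexed domains with a '*' in its console output.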
indexed.add(rsp.getInt(1)); + } + + + linksStmt.setFetchSize(10_000); + rsp = linksStmt.executeQuery(); + while (rsp.next()) { + int source = deAlias(rsp.getInt(1)); + int dest = deAlias(rsp.getInt(2)); + + tmpMap.computeIfAbsent(dest, this::createBitmapWithSelf).add(source); + } + } + + tmpMap.entrySet().stream() + .filter(e -> isEligible(e.getValue())) + .forEach(e -> { + var val = of(e.getValue()); + idsList.add(e.getKey()); + itemsList.add(val); + dToSMap.put(e.getKey(), val); + }); + weights = new float[1 + idsList.stream().mapToInt(i -> i).max().orElse(0)]; + for (int i = 0; i < idsList.size(); i++) { + weights[idsList.get(i)] = getWeight(idsList.get(i)); + } + } + + private boolean isEligible(RoaringBitmap value) { + int cardinality = value.getCardinality(); + + return cardinality < 10000; + } + + private int deAlias(int id) { + int val = aliasMap.get(id); + if (val < 0) + return id; + return val; + } + + LinkedBlockingDeque similaritiesLinkedBlockingDeque = new LinkedBlockingDeque<>(10); + volatile boolean running; + + @SneakyThrows + public void tryDomains(String... domainName) { + var dataStoreDao = new EdgeDataStoreDaoImpl(dataSource); + + System.out.println(Arrays.toString(domainName)); + + int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new) + .map(dataStoreDao::getDomainId) + .mapToInt(EdgeId::id) + .map(this::deAlias) + .toArray(); + + for (int domainId : domainIds) { + findAdjacentDtoS(domainId, similarities -> { + for (var similarity : similarities.similarities()) { + if (indexed.contains(similarity.domainId)) System.out.print("*"); + System.out.println(dataStoreDao.getDomain(new EdgeId<>(similarity.domainId)).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value)); + } + }); + } + } + + private String prettyPercent(double val) { + return String.format("%2.2f%%", 100. * val); + } + + @SneakyThrows + public void loadAll() { + running = true; + var thread = new Thread(this::insertThreadRun); + thread.start(); + idsList.parallelStream() + .filter(id -> !aliasMap.containsKey(id)) + .forEach(id -> findAdjacent(id, this::addToQueue)); + running = false; + thread.join(); + } + + @SneakyThrows + void addToQueue(DomainSimilarities similarities) { + similaritiesLinkedBlockingDeque.putLast(similarities); + } + + public void insertThreadRun() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement( + """ + INSERT INTO EC_DOMAIN_NEIGHBORS_2 + (DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS) + VALUES (?, ?, ?) 
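+                            -- on a duplicate key, GREATEST below keeps the strongest relatedness seen across runs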
+ ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS)) + """) + ) { + while (running || !similaritiesLinkedBlockingDeque.isEmpty()) { + var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS); + if (item == null) continue; + + for (var similarity : item.similarities) { + stmt.setInt(1, item.domainId); + stmt.setInt(2, similarity.domainId); + stmt.setDouble(3, similarity.value); + stmt.addBatch(); + } + stmt.executeBatch(); + } + } catch (SQLException | InterruptedException e) { + throw new RuntimeException(e); + } + } + + public RoaringBitmap createBitmapWithSelf(int val) { + var bm = new RoaringBitmap(); + bm.add(val); + return bm; + } + + public void findAdjacent(int domainId, Consumer<DomainSimilarities> andThen) { + findAdjacentDtoS(domainId, andThen); + } + + double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) { + double andCardinality = andCardinality(a, b); + andCardinality /= Math.sqrt(a.getCardinality()); + andCardinality /= Math.sqrt(b.getCardinality()); + return andCardinality; + } + + double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) { + return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights)); + } + + float getWeight(int i) { + var vector = dToSMap.get(i); + + if (vector == null) return 1.0f; + return 1.0f / (float) Math.log(2+vector.getCardinality()); + } + + record DomainSimilarities(int domainId, List<DomainSimilarity> similarities) {}; + record DomainSimilarity(int domainId, double value) {}; + + @SneakyThrows + private void findAdjacentDtoS(int domainId, Consumer<DomainSimilarities> andThen) { + var vector = dToSMap.get(domainId); + if (vector == null || !vector.cardinalityExceeds(10)) { + return; + } + + System.out.println("DtoS " + domainId); + + List<DomainSimilarity> similarities = new ArrayList<>(1000); + + /** The minimum cardinality a vector b can have so that + * + * a (x) b + * ------- >= k + * |a||b| + * + * is k^2 * card(a): the intersection a (x) b is at most card(b), and |a| = sqrt(card(a)). + * With the threshold k = 0.1 used below, vectors smaller than 0.01 * card(a) can never + * reach it, so they are skipped outright. + */ + int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality())); + + for (int i = 0; i < itemsList.size(); i++) { + + int id = idsList.get(i); + if (id == domainId) + continue; + + var otherVec = itemsList.get(i); + if (otherVec.getCardinality() < cardMin) + continue; + + double similarity = cosineSimilarity(vector, otherVec); + if (similarity > 0.1) { + var recalculated = expensiveCosineSimilarity(vector, otherVec); + if (recalculated > 0.1) { + similarities.add(new DomainSimilarity(id, recalculated)); + } + } + } + + if (similarities.size() > 128) { + similarities.sort(Comparator.comparing(DomainSimilarity::value)); + similarities.subList(0, similarities.size() - 128).clear(); + } + + + andThen.accept(new DomainSimilarities(domainId, similarities)); + } + + +// @SneakyThrows +// private void findAdjacentDtoS(Consumer<DomainSimilarities> andThen, int...
domainIds) { +// var vectors = Arrays.stream(domainIds).mapToObj(dToSMap::get) +// .filter(Objects::nonNull) +// .filter(vec -> vec.cardinalityExceeds(10)) +// .toArray(AndCardIntSet[]::new); +// Set<Integer> domainIdsSet = new HashSet<>(Arrays.stream(domainIds).boxed().toList()); +// +// if (vectors.length != domainIds.length) +// return; +// +// List<DomainSimilarity> similarities = dToSMap.entrySet().parallelStream() +// .filter(e -> !domainIdsSet.contains(e.getKey()) && indexed.contains(e.getKey())) +// .flatMap(entry -> { +// +// double similarity = 0.; +// for (var vector : vectors) { +// similarity += cosineSimilarity(vector, entry.getValue()); +// } +// +// if (similarity > 0.1 * vectors.length) { +// double recalculated = 0; +// for (var vector : vectors) { +// recalculated += expensiveCosineSimilarity(vector, entry.getValue()); +// } +// if (recalculated > 0.1 * vectors.length) { +// return Stream.of(new DomainSimilarity(entry.getKey(), recalculated)); +// } +// } +// return Stream.empty(); +// }).sorted(Comparator.comparing(DomainSimilarity::value)) +// .toList(); +// +// andThen.accept(new DomainSimilarities(domainIds[0], similarities)); +// } + + + public static void main(String[] args) throws SQLException { + DatabaseModule dm = new DatabaseModule(); + + var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection()); + if (args.length == 0) { + main.loadAll(); + } + else { + main.tryDomains(args); + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java index 0f28d80c..1fe6b53a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java @@ -2,8 +2,14 @@ package nu.marginalia.wmsa.api.model; import lombok.AllArgsConstructor; import lombok.Getter; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + @AllArgsConstructor @Getter public class ApiSearchResult { public String url; @@ -11,10 +17,30 @@ public class ApiSearchResult { public String description; public double quality; + public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>(); + public ApiSearchResult(EdgeUrlDetails url) { this.url = url.url.toString(); this.title = url.getTitle(); this.description = url.getDescription(); this.quality = url.getTermScore(); + + if (url.resultItem != null) { + var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set)); + + outer: + for (var entries : bySet.values()) { + List<ApiSearchResultQueryDetails> lst = new ArrayList<>(); + for (var entry : entries) { + var metadata = entry.metadata(); + if (metadata.isEmpty()) + continue outer; + + Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet()); + lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags)); + } + details.add(lst); + } + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResultQueryDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResultQueryDetails.java new file mode 100644 index 00000000..ad146ca8 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResultQueryDetails.java @@ -0,0 +1,16 @@ +package nu.marginalia.wmsa.api.model; +
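+// Per-keyword relevance details (tf-idf, count, flags) attached to an API search result.
+// The field name flagsUnstableAPI presumably signals that this part of the response format may still change.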
+import lombok.AllArgsConstructor; +import lombok.Getter; + +import java.util.Set; + +@AllArgsConstructor @Getter +public class ApiSearchResultQueryDetails { + + String keyword; + int tfIdf; + int count; + + Set<String> flagsUnstableAPI; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java index 4fc59afe..46aebfc7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java @@ -5,6 +5,7 @@ import nu.marginalia.wmsa.auth.AuthMain; import nu.marginalia.wmsa.configuration.command.*; import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain; import nu.marginalia.wmsa.edge.dating.DatingMain; +import nu.marginalia.wmsa.edge.explorer.ExplorerMain; import nu.marginalia.wmsa.edge.index.EdgeIndexMain; import nu.marginalia.wmsa.edge.search.EdgeSearchMain; import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain; @@ -37,6 +38,7 @@ public enum ServiceDescriptor { ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class), DATING("dating", 5070, DatingMain.class), + EXPLORER("explorer", 5071, ExplorerMain.class), TEST_1("test-1", 0, null), TEST_2("test-2", 0, null); @@ -77,9 +79,11 @@ public enum ServiceDescriptor { public static void main(String... args) { MainMapLookup.setMainArguments(args); - Map<String, Command> functions = Stream.of(new ListCommand(), + Map<String, Command> functions = Stream.of( + new ListCommand(), new StartCommand(), new ConvertCommand(), + new CrawlCommand(), new LoadCommand(), new ReindexCommand(), new VersionCommand(), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java new file mode 100644 index 00000000..07c291bb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java @@ -0,0 +1,24 @@ +package nu.marginalia.wmsa.configuration.command; + +import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.crawling.CrawlerMain; + +import java.util.Arrays; + +public class CrawlCommand extends Command { + public CrawlCommand() { + super("crawl"); + } + + @Override + @SneakyThrows + public void execute(String...
args) { + if (args.length < 2) { + System.err.println("Usage: crawl plan.yaml"); + System.exit(255); + } + + String[] args2 = Arrays.copyOfRange(args, 1, args.length); + CrawlerMain.main(args2); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java index cbde361b..18778496 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java @@ -12,6 +12,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; +import spark.Spark; import java.sql.SQLException; @@ -85,6 +86,12 @@ public class ScreenshotService { } private Object serveSvgPlaceholder(Response response, int id) { + + var domainName = edgeDataStoreDao.getDomain(new EdgeId<>(id)).map(Object::toString); + if (domainName.isEmpty()) { + Spark.halt(404); + } + response.type("image/svg+xml"); return String.format("\n" + "%s\n" + " \n" + - "\n", edgeDataStoreDao.getDomain(new EdgeId<>(id))); + "\n", domainName.get()); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java new file mode 100644 index 00000000..f666474b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java @@ -0,0 +1,69 @@ +package nu.marginalia.wmsa.edge.converting; + +import com.github.luben.zstd.ZstdOutputStream; +import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.time.LocalDateTime; +import java.time.ZoneOffset; + +public class ConversionLog implements AutoCloseable, Interpreter { + + + + private final PrintWriter writer; + + public ConversionLog(Path rootDir) throws IOException { + String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC)); + Path logFile = rootDir.resolve(fileName); + + writer = new PrintWriter(new ZstdOutputStream( + new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE)))); + } + + @Override + public void close() throws IOException { + writer.close(); + } + + @Override + public void loadUrl(EdgeUrl[] url) {} + + @Override + public void loadDomain(EdgeDomain[] domain) {} + + @Override + public void loadRssFeed(EdgeUrl[] rssFeed) {} + + @Override + public void loadDomainLink(DomainLink[] links) {} + + @Override + public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {} + + @Override + public void loadProcessedDocument(LoadProcessedDocument 
loadProcessedDocument) {} + + @Override + public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { + writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason()); + } + + @Override + public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {} + + @Override + public void loadDomainRedirect(DomainLink link) {} +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java index eca74633..4efe95e3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java @@ -54,5 +54,4 @@ public class ConvertedDomainReader { return ret; } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java index 0bf6b6f5..b0bb7a9e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java @@ -5,9 +5,9 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.util.ParallelPipe; +import nu.marginalia.wmsa.edge.converting.compiler.InstructionsCompiler; import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; -import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; import nu.marginalia.wmsa.edge.crawling.WorkLog; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; @@ -47,11 +47,15 @@ public class ConverterMain { Gson gson ) throws Exception { - instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson); + ; + + logger.info("Starting pipe"); - try (WorkLog processLog = plan.createProcessWorkLog()) { + try (WorkLog processLog = plan.createProcessWorkLog(); + ConversionLog log = new ConversionLog(plan.process.getDir())) { + instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson); var pipe = new ParallelPipe("Crawler", 20, 4, 2) { @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index b606a274..f8de6c0c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -136,7 +136,7 @@ public class LinkKeywordExtractorMain { try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, - url -> url.param != null, + (url) -> true, //url -> crawledUrls.contains(url.toString().hashCode()), output::write); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java index 7443cdfc..f00c54ee 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java @@ -1,11 +1,7 @@ package nu.marginalia.wmsa.edge.converting; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.model.id.EdgeId; import java.io.IOException; import java.nio.file.Files; @@ -34,9 +30,9 @@ public class LinkKeywordLoaderMain { private record UrlKeyword(String url, String keyword) { public static UrlKeyword parseLine(String line) { - String[] parts = line.split("\t"); - if (parts.length == 2) { - return new UrlKeyword(parts[0], parts[1]); + int idx = line.indexOf('\t'); + if (idx > 0) { + return new UrlKeyword(line.substring(0, idx), line.substring(idx+1)); } return null; } @@ -73,10 +69,10 @@ public class LinkKeywordLoaderMain { int urlId = (int)(id & 0xFFFF_FFFFL); int domainId = (int)(id >>> 32L); -// System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords); + System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords); - indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), - new DocumentKeywords(IndexBlock.Link, keywords.toArray(String[]::new)), 0); +// indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), +// new DocumentKeywords(IndexBlock.Link, keywords.toArray(String[]::new)), 0); } lastLine = urlKeyword.url; @@ -94,6 +90,7 @@ public class LinkKeywordLoaderMain { var conn = ds.getConnection(); var stmt = conn.createStatement()) { + stmt.setFetchSize(10000); var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL"); while (rsp.next()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java index 332e4a4f..90813636 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java @@ -24,10 +24,13 @@ import java.util.List; public class LoadInstructionWriter { + private ConversionLog log; private final Path outputDir; private final Gson gson; private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class); - public LoadInstructionWriter(Path outputDir, Gson gson) { + + public LoadInstructionWriter(ConversionLog log, Path outputDir, Gson gson) { + this.log = log; this.outputDir = outputDir; this.gson = gson; @@ -35,6 +38,7 @@ public class LoadInstructionWriter { throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); } } + public String accept(String id, List instructionList) throws IOException { Path outputFile = getOutputFile(id); @@ -48,6 +52,8 @@ public class LoadInstructionWriter { logger.info("Writing {} - {} - {}", id, instructionList.size(), summary); for (var instr : instructionList) { + instr.apply(log); + outputStream.append(instr.tag().name()); outputStream.append(' '); gson.toJson(instr, outputStream); @@ -66,6 +72,7 @@ public class LoadInstructionWriter { if (!Files.exists(destDir)) { Files.createDirectories(destDir); } + return destDir.resolve(id + ".pzstd"); } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java index 55648dfd..d1c8db01 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java @@ -28,32 +28,8 @@ public class ReindexTriggerMain { .followRedirects(true) .build(); - try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) { - var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); - while (rs.next()) { - System.out.printf("%d %s %s %d\n", - rs.getInt(1), - rs.getString(2), - rs.getString(3), - rs.getInt(4)); - } - - rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100"); - while (rs.next()) { - System.out.printf("%d %d %s %d %s\n", - rs.getInt(1), - rs.getInt(2), - rs.getString(3), - rs.getInt(4), - rs.getString(5)); - - } - - stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0"); - stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT"); - stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT"); - stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT"); - } + var updateStatistics = new UpdateDomainStatistics(db.provideConnection()); + updateStatistics.run(); var rb = new RequestBody() { @@ -70,7 +46,11 @@ public class ReindexTriggerMain { }; client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute(); - client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute(); + + if (!Boolean.getBoolean("no-preconvert")) { + client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute(); + } + for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) { client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java new file mode 100644 index 00000000..162c2d8b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java @@ -0,0 +1,66 @@ +package nu.marginalia.wmsa.edge.converting; + +import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.map.hash.TIntIntHashMap; + +import java.sql.SQLException; + +public class UpdateDomainStatistics { + private final HikariDataSource dataSource; + + public UpdateDomainStatistics(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public void run() throws SQLException { + + // This looks weird, but it's actually much faster than doing the computations with SQL queries + // + // ... 
in part because we can assume the data is immutable and don't mind consuming egregious + // resources + + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement(); + var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL"); + var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,GOOD_URLS,VISITED_URLS) VALUES (?, ?, ?, ?)") + ) { + + stmt.executeUpdate("DELETE FROM DOMAIN_METADATA"); + + TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); + TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); + TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); + + domainInfoQuery.setFetchSize(10_000); + var rsp = domainInfoQuery.executeQuery(); + while (rsp.next()) { + int domainId = rsp.getInt(1); + boolean visited = rsp.getBoolean(2); + boolean stateOk = rsp.getBoolean(3); + + knownUrls.adjustOrPutValue(domainId, 1, 1); + if (visited) { + visitedUrls.adjustOrPutValue(domainId, 1, 1); + if (stateOk) { + goodUrls.adjustOrPutValue(domainId, 1, 1); + } + } + } + + int i = 0; + for (int domainId : knownUrls.keys()) { + insertDomainInfo.setInt(1, domainId); + insertDomainInfo.setInt(2, knownUrls.get(domainId)); + insertDomainInfo.setInt(3, visitedUrls.get(domainId)); + insertDomainInfo.setInt(4, goodUrls.get(domainId)); + insertDomainInfo.addBatch(); + if ((++i % 1000) == 0) { + insertDomainInfo.executeBatch(); + } + } + if ((i % 1000) != 0) { + insertDomainInfo.executeBatch(); + } + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java new file mode 100644 index 00000000..1aebe182 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java @@ -0,0 +1,58 @@ +package nu.marginalia.wmsa.edge.converting.compiler; + +import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadKeywords; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; +import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; +import nu.marginalia.wmsa.edge.index.model.IndexBlockType; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; + +import java.util.List; + +public class DocumentsCompiler { + + public void compile(List<Instruction> ret, List<ProcessedDocument> documents) { + + for (var doc : documents) { + compileDocumentDetails(ret, doc); + } + + for (var doc : documents) { + compileWords(ret, doc); + } + + } + + private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) { + var details = doc.details; + + if (details != null) { + ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality)); + } + else { + ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason)); + } + } + + private void compileWords(List<Instruction> ret, ProcessedDocument doc) { + var words = doc.words; + + if (words != null) { + + var wordsArray = words.values().stream() +
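+                    // filterNonTransients() below drops blocks of type TRANSIENT, so they are never emitted as LoadKeywords instructions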
.filter(this::filterNonTransients) + .map(DocumentKeywords::new) + .toArray(DocumentKeywords[]::new); + + ret.add(new LoadKeywords(doc.url, wordsArray)); + } + } + + private boolean filterNonTransients(EdgePageWords words) { + return words.block.type != IndexBlockType.TRANSIENT; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/FeedsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/FeedsCompiler.java new file mode 100644 index 00000000..e3774288 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/FeedsCompiler.java @@ -0,0 +1,23 @@ +package nu.marginalia.wmsa.edge.converting.compiler; + +import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadRssFeed; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +import java.util.List; +import java.util.Objects; + +public class FeedsCompiler { + + public void compile(List<Instruction> ret, List<ProcessedDocument> documents) { + + EdgeUrl[] feeds = documents.stream().map(doc -> doc.details) + .filter(Objects::nonNull) + .flatMap(dets -> dets.feedLinks.stream()) + .distinct() + .toArray(EdgeUrl[]::new); + + ret.add(new LoadRssFeed(feeds)); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/InstructionsCompiler.java new file mode 100644 index 00000000..1b3614a1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/InstructionsCompiler.java @@ -0,0 +1,57 @@ +package nu.marginalia.wmsa.edge.converting.compiler; + +import com.google.inject.Inject; +import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDomain; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; + +import java.util.ArrayList; +import java.util.List; + +public class InstructionsCompiler { + private final UrlsCompiler urlsCompiler; + private final DocumentsCompiler documentsCompiler; + private final FeedsCompiler feedsCompiler; + private final LinksCompiler linksCompiler; + private final RedirectCompiler redirectCompiler; + + @Inject + public InstructionsCompiler(UrlsCompiler urlsCompiler, + DocumentsCompiler documentsCompiler, + FeedsCompiler feedsCompiler, + LinksCompiler linksCompiler, + RedirectCompiler redirectCompiler) + { + this.urlsCompiler = urlsCompiler; + this.documentsCompiler = documentsCompiler; + this.feedsCompiler = feedsCompiler; + this.linksCompiler = linksCompiler; + this.redirectCompiler = redirectCompiler; + } + + public List<Instruction> compile(ProcessedDomain domain) { + List<Instruction> ret = new ArrayList<>(domain.size()*4); + + ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); + + if (domain.documents != null) { + urlsCompiler.compile(ret, domain.documents); + documentsCompiler.compile(ret, domain.documents); + + feedsCompiler.compile(ret, domain.documents); + + linksCompiler.compile(ret, domain.domain, domain.documents); + } + if (domain.redirect != null) { + redirectCompiler.compile(ret, domain.domain, domain.redirect); + } + + return ret; + } + + + + + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/LinksCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/LinksCompiler.java
new file mode 100644 index 00000000..cb115821 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/LinksCompiler.java @@ -0,0 +1,26 @@ +package nu.marginalia.wmsa.edge.converting.compiler; + +import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; +import nu.marginalia.wmsa.edge.model.EdgeDomain; + +import java.util.List; +import java.util.Objects; + +public class LinksCompiler { + + public void compile(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) { + + DomainLink[] links = documents.stream().map(doc -> doc.details) + .filter(Objects::nonNull) + .flatMap(dets -> dets.linksExternal.stream()) + .map(link -> link.domain) + .distinct() + .map(domain -> new DomainLink(from, domain)) + .toArray(DomainLink[]::new); + + ret.add(new LoadDomainLink(links)); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/RedirectCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/RedirectCompiler.java new file mode 100644 index 00000000..2a1e42f7 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/RedirectCompiler.java @@ -0,0 +1,19 @@ +package nu.marginalia.wmsa.edge.converting.compiler; + +import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainRedirect; +import nu.marginalia.wmsa.edge.model.EdgeDomain; + +import java.util.List; + +public class RedirectCompiler { + + public void compile(List<Instruction> ret, EdgeDomain from, EdgeDomain to) { + ret.add(new LoadDomain(to)); + ret.add(new LoadDomainLink(new DomainLink(from, to))); + ret.add(new LoadDomainRedirect(new DomainLink(from, to))); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/UrlsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/UrlsCompiler.java new file mode 100644 index 00000000..b847aa21 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/UrlsCompiler.java @@ -0,0 +1,49 @@ +package nu.marginalia.wmsa.edge.converting.compiler; + +import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadUrl; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class UrlsCompiler { + + private static final int MAX_INTERNAL_LINKS = 25; + + public void compile(List<Instruction> ret, List<ProcessedDocument> documents) { + Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4); + Set<EdgeDomain> seenDomains = new HashSet<>(documents.size()); + + for (var doc : documents) { + seenUrls.add(doc.url); + + if (doc.details != null) { + + for (var url : doc.details.linksExternal) { + if (seenDomains.add(url.domain)) { +
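+                        // Set.add() returns true only for the first link to a given domain, so at most one external URL per newly seen domain is recorded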
seenUrls.add(url); + } + } + + if (doc.isOk()) { + // Don't load more than a few from linksInternal, grows too big for no reason + var linksToAdd = new ArrayList<>(doc.details.linksInternal); + if (linksToAdd.size() > MAX_INTERNAL_LINKS) { + linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear(); + } + seenUrls.addAll(linksToAdd); + } + } + } + + ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); + ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java index e9d2471f..27e16c5b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java @@ -1,17 +1,47 @@ package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import java.util.Arrays; -public record DocumentKeywords(IndexBlock block, String... keywords) { +public record DocumentKeywords(IndexBlock block, + String[] keywords, + long[] metadata) { + public DocumentKeywords(EdgePageWords words) { - this(words.block, words.words.toArray(String[]::new)); + this(words.block, + words.words.toArray(String[]::new), + words.metadata.toArray()); } @Override public String toString() { - return getClass().getSimpleName()+"["+block +", "+Arrays.toString(keywords)+"]"; + StringBuilder sb = new StringBuilder(); + sb.append(getClass().getSimpleName()); + sb.append('[').append(block).append(", "); + for (int i = 0; i < keywords.length; i++) { + sb.append("\n\t "); + if (metadata[i] != 0) { + sb.append(keywords[i]).append("/").append(new EdgePageWordMetadata(metadata[i])); + } + else { + sb.append(keywords[i]); + } + } + return sb.append("\n]").toString(); + } + + public boolean isEmpty() { + return keywords.length == 0; + } + + public int size() { + return keywords.length; + } + + public DocumentKeywords subList(int start, int end) { + return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocumentWithError.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocumentWithError.java index 40f6bd31..8d37cb64 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocumentWithError.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocumentWithError.java @@ -8,7 +8,8 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; public record LoadProcessedDocumentWithError(EdgeUrl url, - EdgeUrlState state) implements Instruction + EdgeUrlState state, + String reason) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index cef529a0..6f835863 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -25,34 +25,13 @@ public class SqlLoadUrls { @Inject public SqlLoadUrls(HikariDataSource dataSource) { this.dataSource = dataSource; - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.createStatement()) { - stmt.execute("DROP PROCEDURE IF EXISTS INSERT_URL"); - stmt.execute(""" - CREATE PROCEDURE INSERT_URL ( - IN PROTO VARCHAR(255), - IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, - IN PORT INT, - IN PATH VARCHAR(255), - IN PARAM VARCHAR(255), - IN PATH_HASH BIGINT - ) - BEGIN - INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; - END - """); - } - } - catch (SQLException ex) { - throw new RuntimeException("Failed to set up loader", ex); - } } public void load(LoaderData data, EdgeUrl[] urls) { Set affectedDomains = new HashSet<>(); try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)"); + var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?") ) { @@ -67,7 +46,7 @@ public class SqlLoadUrls { affectedDomains.add(url.domain); insertCall.setString(1, url.proto); - insertCall.setString(2, url.domain.toString()); + insertCall.setInt(2, data.getDomainId(url.domain)); if (url.port != null) { insertCall.setInt(3, url.port); } @@ -79,7 +58,7 @@ public class SqlLoadUrls { insertCall.setLong(6, hashPath(url.path, url.param)); insertCall.addBatch(); - if (cnt++ == 250) { + if (cnt++ == 1000) { var ret = insertCall.executeBatch(); conn.commit(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java index 4f30e7da..3c97622a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java @@ -1,11 +1,18 @@ package nu.marginalia.wmsa.edge.converting.model; +import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; + public class DisqualifiedException extends Exception { public final DisqualificationReason reason; public DisqualifiedException(DisqualificationReason reason) { this.reason = reason; } + + public DisqualifiedException(CrawlerDocumentStatus crawlerStatus) { + this.reason = DisqualificationReason.fromCrawlerStatus(crawlerStatus); + } + @Override public Throwable fillInStackTrace() { return this; @@ -18,6 +25,22 @@ public class DisqualifiedException extends Exception { STATUS, QUALITY, ACCEPTABLE_ADS, - FORBIDDEN + FORBIDDEN, + SHORT_CIRCUIT, + + PROCESSING_EXCEPTION, + + BAD_CONTENT_TYPE, + BAD_CHARSET, + REDIRECT, + ROBOTS_TXT, + ERROR, + Timeout, // Don't you dare + BAD_CANONICAL + ; + + public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) { + return DisqualificationReason.valueOf(crawlerStatus.name()); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java index 1b8eb155..67e0f0df 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java @@ -17,6 +17,10 @@ public class ProcessedDocument { public EdgeUrlState state; public String stateReason; + public boolean isOk() { + return EdgeUrlState.OK == state; + } + public OptionalDouble quality() { if (details != null) { return OptionalDouble.of(details.quality); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 5037c791..e7ebd4e0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -7,6 +7,7 @@ import nu.marginalia.util.language.LanguageFilter; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; @@ -81,32 +82,12 @@ public class DocumentProcessor { return ret; } + public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) { ProcessedDocument ret = new ProcessedDocument(); try { - ret.url = getDocumentUrl(crawledDocument); - ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); - - if (ret.state == EdgeUrlState.OK) { - - if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) { - throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); - } - - if (isAcceptedContentType(crawledDocument)) { - var detailsWords = createDetails(crawledDomain, crawledDocument); - - ret.details = detailsWords.details(); - ret.words = detailsWords.words(); - } - else { - throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE); - } - } - else { - throw new DisqualifiedException(DisqualificationReason.STATUS); - } + processDocument(crawledDocument, crawledDomain, ret); } catch (DisqualifiedException ex) { ret.state = EdgeUrlState.DISQUALIFIED; @@ -115,6 +96,7 @@ public class DocumentProcessor { } catch (Exception ex) { ret.state = EdgeUrlState.DISQUALIFIED; + ret.stateReason = DisqualificationReason.PROCESSING_EXCEPTION.toString(); logger.info("Failed to convert " + crawledDocument.url, ex); ex.printStackTrace(); } @@ -122,6 +104,32 @@ public class DocumentProcessor { return ret; } + private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + + var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); + if (crawlerStatus != CrawlerDocumentStatus.OK) { + throw new DisqualifiedException(crawlerStatus); + } + + if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) { + throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); + } + + if 
(!isAcceptedContentType(crawledDocument)) { + throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE); + } + + + ret.url = getDocumentUrl(crawledDocument); + ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); + + var detailsWithWordsLinks = createDetails(crawledDomain, crawledDocument); + + ret.details = detailsWithWordsLinks.details(); + ret.words = detailsWithWordsLinks.words(); + } + + private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) throws URISyntaxException { @@ -162,16 +170,25 @@ public class DocumentProcessor { private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException { + if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) { + throw new DisqualifiedException(DisqualificationReason.LANGUAGE); + } + Document doc = Jsoup.parse(crawledDocument.documentBody); if (AcceptableAds.hasAcceptableAdsTag(doc)) { throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); } + if (doc.select("meta[name=robots]").attr("content").contains("noindex")) { throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); } + final EdgeUrl url = new EdgeUrl(crawledDocument.url); + Document prunedDoc = doc.clone(); + + prunedDoc.getElementsByTag("svg").remove(); prunedDoc.body().filter(new DomPruningFilter(0.5)); var dld = sentenceExtractor.extractSentences(prunedDoc); @@ -184,24 +201,27 @@ public class DocumentProcessor { ret.standard = getHtmlStandard(doc); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); - ret.quality = documentValuator.getQuality(ret.standard, doc, dld); + ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong(); - final boolean doSimpleProcessing = ret.quality < minDocumentQuality; + KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality); EdgePageWordSet words; - if (doSimpleProcessing) { + if (shouldDoSimpleProcessing(url, ret)) { + /* Some documents we'll index, but only superficially. This is a compromise + to allow them to be discoverable, without having them show up without specific + queries. This also saves a lot of processing power. 
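+               (This is the path taken when shouldDoSimpleProcessing() below matches, e.g. a quality score under minDocumentQuality; such pages get extractKeywordsMinimal() and an empty description.)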
+ */ ret.features = Set.of(HtmlFeature.UNKNOWN); - words = keywordExtractor.extractKeywordsMinimal(dld); + words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata); ret.description = ""; } else { ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); - words = keywordExtractor.extractKeywords(dld); + words = keywordExtractor.extractKeywords(dld, keywordMetadata); ret.description = getDescription(doc); } - var url = new EdgeUrl(crawledDocument.url); addMetaWords(ret, url, crawledDomain, words); getLinks(url, ret, doc, words); @@ -209,6 +229,33 @@ public class DocumentProcessor { return new DetailsWithWords(ret, words); } + private boolean shouldDoSimpleProcessing(EdgeUrl url, ProcessedDocumentDetails ret) { + if (ret.quality < minDocumentQuality) { + return true; + } + + // These pages shouldn't be publicly accessible + if ("phpinfo()".equals(ret.title)) { + return true; + } + + // Urls that look like /@foo are typically Mastodon or other twitter-like feeds, + // we don't want to index them because they change so rapidly; subdirectories are + // fine though + // + // The first startsWith criteria is a performance optimization, even with a compiled + // pattern it is something like 50x faster + if (url.path.startsWith("/@") && url.path.matches("^/@[^/]+/?$")) { + return true; + } + + // Annoying wordpress crap + if (url.path.startsWith("/tag/") && url.path.endsWith("/")) { + return true; + } + return false; + } + private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) { List tagWords = new ArrayList<>(); @@ -229,7 +276,7 @@ public class DocumentProcessor { ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); - words.append(IndexBlock.Meta, tagWords); + words.appendWithNoMeta(IndexBlock.Meta, tagWords); } private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { @@ -263,14 +310,21 @@ public class DocumentProcessor { .ifPresent(lp::acceptFeed); } + createLinkKeywords(words, lp); + createFileLinkKeywords(words, lp, domain); + } + + private void createLinkKeywords(EdgePageWordSet words, LinkProcessor lp) { final Set linkTerms = new HashSet<>(); for (var fd : lp.getForeignDomains()) { linkTerms.add("links:"+fd.toString().toLowerCase()); linkTerms.add("links:"+fd.getDomain().toLowerCase()); } - words.append(IndexBlock.Meta, linkTerms); + words.appendWithNoMeta(IndexBlock.Meta, linkTerms); + } + private void createFileLinkKeywords(EdgePageWordSet words, LinkProcessor lp, EdgeDomain domain) { Set fileKeywords = new HashSet<>(100); for (var link : lp.getNonIndexableUrls()) { @@ -281,8 +335,8 @@ public class DocumentProcessor { synthesizeFilenameKeyword(fileKeywords, link); } - words.append(IndexBlock.Artifacts, fileKeywords); + words.appendWithNoMeta(IndexBlock.Artifacts, fileKeywords); } private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { @@ -331,5 +385,7 @@ public class DocumentProcessor { return doc.text().length(); } - private record DetailsWithWords(ProcessedDocumentDetails details, EdgePageWordSet words) {} + private record DetailsWithWords(ProcessedDocumentDetails details, + EdgePageWordSet words) {} + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index eb1c6f1e..4e04e3fa 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -3,17 +3,22 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.common.base.Strings; import com.google.inject.Inject; import com.google.inject.name.Named; +import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor; +import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.IndexBlockType; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import java.util.*; +import java.util.stream.Collectors; import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL; @@ -47,6 +52,8 @@ public class DomainProcessor { fixBadCanonicalTags(crawledDomain.doc); + InternalLinkGraph internalLinkGraph = new InternalLinkGraph(); + DocumentDisqualifier disqualifier = new DocumentDisqualifier(); for (var doc : crawledDomain.doc) { if (disqualifier.isQualified()) { @@ -54,6 +61,9 @@ public class DomainProcessor { if (processedDoc.url != null) { ret.documents.add(processedDoc); + + internalLinkGraph.accept(processedDoc); + processedDoc.quality().ifPresent(disqualifier::offer); } else if ("LANGUAGE".equals(processedDoc.stateReason)) { @@ -62,24 +72,16 @@ public class DomainProcessor { } else { // Short-circuit processing if quality is too low var stub = documentProcessor.makeDisqualifiedStub(doc); + stub.stateReason = DisqualifiedException.DisqualificationReason.SHORT_CIRCUIT.toString(); if (stub.url != null) { ret.documents.add(stub); } } } - Set commonSiteWords = new HashSet<>(10); + flagCommonSiteWords(ret); + flagAdjacentSiteWords(internalLinkGraph, ret); - commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects)); - commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title)); - - if (!commonSiteWords.isEmpty()) { - for (var doc : ret.documents) { - if (doc.words != null) { - doc.words.get(IndexBlock.Site).addAll(commonSiteWords); - } - } - } } else { ret.documents = Collections.emptyList(); @@ -90,6 +92,70 @@ public class DomainProcessor { return ret; } + private void flagCommonSiteWords(ProcessedDomain processedDomain) { + Set commonSiteWords = new HashSet<>(10); + + commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Tfidf_High, IndexBlock.Subjects)); + commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Title)); + + if (commonSiteWords.isEmpty()) { + return; + } + + for (var doc : processedDomain.documents) { + if (doc.words != null) { + for (var block : IndexBlock.values()) { + if (block.type == IndexBlockType.PAGE_DATA) { + doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.Site, commonSiteWords); + } + } + } + } + } + + private void 
flagAdjacentSiteWords(InternalLinkGraph internalLinkGraph, ProcessedDomain processedDomain) { + var invertedGraph = internalLinkGraph.trimAndInvert(); + + Map> linkedKeywords = new HashMap<>(100); + + invertedGraph.forEach((url, linkingUrls) -> { + Map keywords = new HashMap<>(100); + + for (var linkingUrl : linkingUrls) { + for (var keyword : internalLinkGraph.getKeywords(linkingUrl)) { + keywords.merge(keyword, 1, Integer::sum); + } + } + + var words = keywords.entrySet().stream() + .filter(e -> e.getValue() > 3) + .map(Map.Entry::getKey) + .filter(internalLinkGraph.getCandidateKeywords(url)::contains) + .collect(Collectors.toSet()); + if (!words.isEmpty()) { + linkedKeywords.put(url, words); + } + }); + + for (var doc : processedDomain.documents) { + if (doc.words == null) + continue; + + final Set keywords = linkedKeywords.get(doc.url); + if (keywords == null) + continue; + + for (var block : IndexBlock.values()) { + if (block.type == IndexBlockType.PAGE_DATA) { + doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.SiteAdjacent, keywords); + } + } + } + + + } + + private void fixBadCanonicalTags(List docs) { Map> seenCanonicals = new HashMap<>(); Set seenUrls = new HashSet<>(); @@ -162,7 +228,8 @@ public class DomainProcessor { } boolean isQualified() { - return count < 25 || goodCount*10 >= count; + return true; +// return count < 25 || goodCount*10 >= count; } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java deleted file mode 100644 index b771b911..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java +++ /dev/null @@ -1,116 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.processor; - -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.*; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -import java.util.*; - -public class InstructionsCompiler { - - public List compile(ProcessedDomain domain) { - List ret = new ArrayList<>(domain.size()*4); - - ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); - - if (domain.documents != null) { - compileUrls(ret, domain.documents); - compileDocuments(ret, domain.documents); - compileFeeds(ret, domain.documents); - - compileLinks(ret, domain.domain, domain.documents); - } - if (domain.redirect != null) { - compileRedirect(ret, domain.domain, domain.redirect); - } - - return ret; - } - - private void compileRedirect(List ret, EdgeDomain from, EdgeDomain to) { - ret.add(new LoadDomain(to)); - ret.add(new LoadDomainLink(new DomainLink(from, to))); - ret.add(new LoadDomainRedirect(new DomainLink(from, to))); - } - - private void compileUrls(List ret, List documents) { - Set seenUrls = new HashSet<>(documents.size()*4); - Set seenDomains = new HashSet<>(documents.size()); - - for (var doc : documents) { - seenUrls.add(doc.url); - - if (doc.details != null) { - for (var url : doc.details.linksExternal) { - seenDomains.add(url.domain); - } - seenUrls.addAll(doc.details.linksExternal); - seenUrls.addAll(doc.details.linksInternal); - } - } - - ret.add(new 
LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); - ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); - } - - private void compileLinks(List ret, EdgeDomain from, List documents) { - DomainLink[] links = documents.stream().map(doc -> doc.details) - .filter(Objects::nonNull) - .flatMap(dets -> dets.linksExternal.stream()) - .map(link -> link.domain) - .distinct() - .map(domain -> new DomainLink(from, domain)) - .toArray(DomainLink[]::new); - - ret.add(new LoadDomainLink(links)); - } - - private void compileFeeds(List ret, List documents) { - - EdgeUrl[] feeds = documents.stream().map(doc -> doc.details) - .filter(Objects::nonNull) - .flatMap(dets -> dets.feedLinks.stream()) - .distinct() - .toArray(EdgeUrl[]::new); - - ret.add(new LoadRssFeed(feeds)); - } - - private void compileDocuments(List ret, List documents) { - - for (var doc : documents) { - compileDocumentDetails(ret, doc); - } - - for (var doc : documents) { - compileWords(ret, doc); - } - - } - - private void compileDocumentDetails(List ret, ProcessedDocument doc) { - var details = doc.details; - - if (details != null) { - ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality)); - } - else { - ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state)); - } - } - - private void compileWords(List ret, ProcessedDocument doc) { - var words = doc.words; - if (words != null) { - var wordsArray = words.values().stream() - .map(DocumentKeywords::new) - .toArray(DocumentKeywords[]::new); - - ret.add(new LoadKeywords(doc.url, wordsArray)); - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java index b0423efa..46fc7925 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import crawlercommons.utils.Strings; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; +import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; import org.jsoup.nodes.Document; @@ -23,13 +24,12 @@ public class DocumentValuator { ); - public double getQuality(EdgeHtmlStandard htmlStandard, Document doc, DocumentLanguageData dld) throws DisqualifiedException { + public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException { double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count(); - double scriptPenalty = getScriptPenalty(doc); + double scriptPenalty = getScriptPenalty(parsedDocument); - - int textBodyLength = doc.text().length(); - int rawLength = doc.html().length(); + int textBodyLength = parsedDocument.text().length(); + int rawLength = crawledDocument.documentBody.length(); if (textBodyLength == 0) { throw new DisqualifiedException(LENGTH); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java 
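
The flagAdjacentSiteWords method added to DomainProcessor above propagates keywords along the inverted internal link graph: a word is flagged SiteAdjacent on a page when more than three of the pages linking to it carry the word, and the word is already among the page's own candidate keywords. The counting step in isolation, as a sketch with plain strings standing in for the patch's URL and keyword types:

    import java.util.*;

    class AdjacentKeywordCounter {
        /** Words carried by more than minCount linking pages that are also
         *  candidates on the target page itself (the patch uses minCount = 3). */
        static Set<String> adjacentWords(Collection<Set<String>> linkingPagesWords,
                                         Set<String> candidatesOnTarget,
                                         int minCount) {
            Map<String, Integer> counts = new HashMap<>();
            for (Set<String> words : linkingPagesWords) {
                for (String word : words) {
                    counts.merge(word, 1, Integer::sum);
                }
            }
            Set<String> result = new HashSet<>();
            counts.forEach((word, n) -> {
                if (n > minCount && candidatesOnTarget.contains(word)) {
                    result.add(word);
                }
            });
            return result;
        }
    }
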
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java index 8e48f719..2681b1c6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java @@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.*; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -43,13 +40,15 @@ public class FeatureExtractor { private final RecipeDetector recipeDetector; private final TextileCraftDetector textileCraftDetector; private final WoodworkingDetector woodworkingDetector; + private final GoogleAnwersSpamDetector googleAnwersSpamDetector; @Inject - public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) { + public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) { this.adblockSimulator = adblockSimulator; this.recipeDetector = recipeDetector; this.textileCraftDetector = textileCraftDetector; this.woodworkingDetector = woodworkingDetector; + this.googleAnwersSpamDetector = googleAnwersSpamDetector; } public Set getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) { @@ -57,6 +56,10 @@ public class FeatureExtractor { final Elements scriptTags = doc.getElementsByTag("script"); + if (googleAnwersSpamDetector.testP(doc) > 0.5) { + features.add(HtmlFeature.GA_SPAM); + } + for (var scriptTag : scriptTags) { if (isJavascriptTag(scriptTag)) { features.add(HtmlFeature.JS); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java index 5b875442..c4d0181c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java @@ -7,14 +7,14 @@ public enum HtmlFeature { JS("special:scripts"), AFFILIATE_LINK( "special:affiliate"), TRACKING("special:tracking"), + COOKIES("special:cookies"), - CATEGORY_FOOD("category:food"), - ADVERTISEMENT("special:ads"), - CATEGORY_CRAFTS("category:crafts"), + GA_SPAM("special:gaspam"), + UNKNOWN("special:uncategorized") ; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java new file mode 100644 index 00000000..abb2b619 --- /dev/null +++ 
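
HtmlFeature.encode(details.features) is called by the compiler but its body is not part of this patch; given that the feature set appears as a single FEATURES column in EC_URL_VIEW, one plausible shape is an ordinal bitmask. A purely hypothetical sketch, not the actual implementation:

    import java.util.Set;

    enum FeatureSketch {
        MEDIA, JS, AFFILIATE_LINK, TRACKING, COOKIES, GA_SPAM, UNKNOWN;

        // Hypothetical: one bit per enum constant, keyed by ordinal.
        static int encode(Set<FeatureSketch> features) {
            int mask = 0;
            for (FeatureSketch f : features) {
                mask |= 1 << f.ordinal();
            }
            return mask;
        }

        static boolean isSet(int mask, FeatureSketch f) {
            return (mask & (1 << f.ordinal())) != 0;
        }
    }

If ordinal keying is indeed used, reordering or deleting enum constants (as this patch does with the CATEGORY_* values) would change the meaning of already-stored masks, which is worth keeping in mind when reading the enum diff above.
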
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java @@ -0,0 +1,54 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +import java.util.*; + +public class InternalLinkGraph { + private final Map> internalLinkGraph = new HashMap<>(1000); + private final Set goodUrls = new HashSet<>(1000); + private final Map> topKeywordsByUrl = new HashMap<>(1000); + private final Map> candidateKeywordsByUrl = new HashMap<>(1000); + + public void accept(ProcessedDocument doc) { + if (doc.details == null || doc.details.linksInternal == null) + return; + + goodUrls.add(doc.url); + internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal)); + + Set topKeywords = new HashSet<>(doc.words.get(IndexBlock.Tfidf_High).words); + topKeywords.addAll(doc.words.get(IndexBlock.Subjects).words); + topKeywordsByUrl.put(doc.url, topKeywords); + + Set candidateKeywords = new HashSet<>(topKeywords); + candidateKeywords.addAll(doc.words.get(IndexBlock.Tfidf_High).words); + candidateKeywords.addAll(doc.words.get(IndexBlock.Subjects).words); + candidateKeywordsByUrl.put(doc.url, candidateKeywords); + } + + public Map> trimAndInvert() { + internalLinkGraph.values().forEach(dest -> dest.retainAll(goodUrls)); + + Map> inverted = new HashMap<>(goodUrls.size()); + + internalLinkGraph.forEach((source, dests) -> { + dests.forEach(dest -> inverted.computeIfAbsent(dest, + d->new HashSet<>(25)) + .add(source)); + }); + + internalLinkGraph.clear(); + + return inverted; + } + + public Set getKeywords(EdgeUrl url) { + return topKeywordsByUrl.getOrDefault(url, Collections.emptySet()); + } + public Set getCandidateKeywords(EdgeUrl url) { + return candidateKeywordsByUrl.getOrDefault(url, Collections.emptySet()); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 06313f1d..5e893725 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -5,7 +5,6 @@ import com.google.common.base.Strings; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jetbrains.annotations.Contract; -import org.jetbrains.annotations.Nullable; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; @@ -202,7 +201,6 @@ public class LinkParser { return binarySuffixList.stream().anyMatch(str::endsWith); } - @Nullable public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) { var baseTags = parsed.getElementsByTag("base"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java index 7560cdd1..8b37e4c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java @@ -1,9 +1,13 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; +import org.apache.commons.lang3.StringUtils; + import javax.annotation.Nullable; -import java.util.Arrays; 
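
InternalLinkGraph.trimAndInvert above does two things: retainAll(goodUrls) drops links pointing at pages that never made it through processing, and the loop flips edge direction so each page maps to the set of pages that link to it, which is what flagAdjacentSiteWords consumes. The inversion step on its own, sketched with strings instead of EdgeUrl:

    import java.util.*;

    class GraphInverter {
        // source -> outgoing links  becomes  target -> incoming sources
        static Map<String, Set<String>> invert(Map<String, Set<String>> outgoing) {
            Map<String, Set<String>> incoming = new HashMap<>();
            outgoing.forEach((source, targets) -> {
                for (String target : targets) {
                    incoming.computeIfAbsent(target, t -> new HashSet<>()).add(source);
                }
            });
            return incoming;
        }
    }

Given {a=[b, c], b=[c]}, invert returns {b=[a], c=[a, b]}.
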
+import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.StringJoiner; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class QueryParams { @@ -15,10 +19,28 @@ public class QueryParams { return null; } - var ret = Arrays.stream(paramSplitterPattern.split(queryParams)) - .filter(param -> QueryParams.isPermittedParam(path, param)) - .sorted() - .collect(Collectors.joining("&")); + String ret; + if (queryParams.indexOf('&') >= 0) { + + List parts = new ArrayList<>(); + for (var part : StringUtils.split(queryParams, '&')) { + if (QueryParams.isPermittedParam(path, part)) { + parts.add(part); + } + } + if (parts.size() > 1) { + parts.sort(Comparator.naturalOrder()); + } + StringJoiner retJoiner = new StringJoiner("&"); + parts.forEach(retJoiner::add); + ret = retJoiner.toString(); + } + else if (isPermittedParam(path, queryParams)) { + ret = queryParams; + } + else { + return null; + } if (ret.isBlank()) return null; @@ -29,11 +51,29 @@ public class QueryParams { public static boolean isPermittedParam(String path, String param) { if (path.endsWith(".cgi")) return true; + if (path.endsWith("/posting.php")) return false; + if (param.startsWith("id=")) return true; - if (param.startsWith("p=")) return true; + if (param.startsWith("p=")) { + // Don't retain forum links with post-id:s, they're always non-canonical and eat up a lot of + // crawling bandwidth + + if (path.endsWith("showthread.php") || path.endsWith("viewtopic.php")) { + return false; + } + return true; + } + if (param.startsWith("f=")) { + if (path.endsWith("showthread.php") || path.endsWith("viewtopic.php")) { + return false; + } + return true; + } if (param.startsWith("i=")) return true; + if (param.startsWith("start=")) return true; if (param.startsWith("t=")) return true; if (param.startsWith("v=")) return true; + if (param.startsWith("post=")) return true; if (path.endsWith("index.php")) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/GoogleAnwersSpamDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/GoogleAnwersSpamDetector.java new file mode 100644 index 00000000..75e7fea3 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/GoogleAnwersSpamDetector.java @@ -0,0 +1,36 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.topic; + +import org.jsoup.nodes.Document; + +import java.util.List; + +public class GoogleAnwersSpamDetector { + + private final List prefixes = List.of("What", "Why", "How", "When", "Is"); + + public double testP(Document doc) { + if (trialTag(doc, "h1")) return 1; + if (trialTag(doc, "h2")) return 1; + if (trialTag(doc, "h3")) return 1; + + return 0; + } + + private boolean trialTag(Document doc, String tagName) { + int positive = 0; + int total = 0; + + for (var elem : doc.getElementsByTag(tagName)) { + String text = elem.text(); + for (var prefix : prefixes) { + if (text.startsWith(prefix)) { + positive++; + break; + } + } + total ++; + } + + return positive > 4 && positive / (double) total > 0.5; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index 3281de8c..5a86fdd8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ 
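
The GoogleAnwersSpamDetector heuristic above targets pages stuffed with question-form headings to farm "people also ask" search traffic: a page is flagged when, for any of h1/h2/h3, more than four headings start with a question word and those make up more than half of the headings at that level. A hypothetical usage example against a synthetic document (Jsoup is already a dependency of the processor):

    import org.jsoup.Jsoup;

    class GaSpamDetectorExample {
        public static void main(String[] args) {
            var doc = Jsoup.parse("""
                <h2>What is it?</h2> <h2>Why use it?</h2> <h2>How does it work?</h2>
                <h2>When was it made?</h2> <h2>Is it safe?</h2> <h2>What does it cost?</h2>
                """);
            // Six h2 headings, all question-prefixed: positive = 6 > 4 and
            // 6/6 = 1.0 > 0.5, so testP returns 1 and the page gets GA_SPAM.
            System.out.println(new GoogleAnwersSpamDetector().testP(doc));
        }
    }

Five question headings out of twenty would not trip it: the count test passes (5 > 4) but the ratio test fails (0.25 < 0.5).
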
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -29,7 +29,7 @@ public class CrawlJobExtractorMain { """ SELECT ID FROM EC_DOMAIN - WHERE URL_PART=? + WHERE DOMAIN_NAME=? """; private static final String domainsSql = @@ -115,6 +115,16 @@ public class CrawlJobExtractorMain { } } + public static void writeSpec(Path outFile, CrawlingSpecification... specs) throws IOException { + Gson gson = GsonFactory.get(); + + try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { + for (var spec : specs) { + out.println(gson.toJson(spec)); + } + } + } + private record DomainWithId(String domainName, int id) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java new file mode 100644 index 00000000..b26f501a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java @@ -0,0 +1,116 @@ +package nu.marginalia.wmsa.edge.crawling; + +import io.github.bucket4j.Bandwidth; +import io.github.bucket4j.Bucket; +import io.github.bucket4j.Refill; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +public class CrawlerTestMain { + + static Bucket rateLimiter60RPM; + static List successfullyFetched = new ArrayList<>(); + + public static void main(String... args) { + var refill = Refill.greedy(1, Duration.ofSeconds(1)); + + var bw = Bandwidth.classic(10, refill); + rateLimiter60RPM = Bucket.builder().addLimit(bw).build(); + + Spark.port(8080); + Spark.before(CrawlerTestMain::before); + Spark.after(CrawlerTestMain::after); + Spark.get("/rate-limit/", CrawlerTestMain::index); + Spark.get("/rate-limit/:n", CrawlerTestMain::n); + + Spark.before("/rate-limit/:n", CrawlerTestMain::rateLimitRequest); + Spark.before("/intermittent-error/:n", CrawlerTestMain::simulateRandomTimeouts); + + Spark.get("/intermittent-error/", CrawlerTestMain::index); + Spark.get("/intermittent-error/:n", CrawlerTestMain::n); + + } + + private static void rateLimitRequest(Request request, Response response) { + if (!rateLimiter60RPM.tryConsume(1)) { + Spark.halt(429); + } + } + + private static void simulateRandomTimeouts(Request request, Response response) { + if (Math.random() < 0.25) { + System.out.println("Simulating error"); + Spark.halt(503); + } + } + + public static void before(Request request, Response response) { + System.out.println(request.pathInfo()); + successfullyFetched.add(request.pathInfo()); + } + public static void after(Request request, Response response) { + if (response.status() < 300) { + successfullyFetched.add(request.pathInfo()); + } + } + + private static Object n(Request request, Response response) { + + int num = Integer.parseInt(request.params("n")); + return """
+ <!doctype html>
+ <html>
+ <head><title>Index</title></head>
+ <body>
+ <h1>Index</h1>
+ """
+ + String.format("<a href=\"%d\">Next</a>, <a href=\"%d\">Next 2</a>", num+1, num+2)
+ + """
+ <p>
+ Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless
+ sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their
+ bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment.
+
+ Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles.
+ Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an
+ evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest.
+ He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a
+ golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the
+ two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who
+ live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free
+ my darling child; show reverence for Zeus’s son, far-striking Apollo.’
+ """;
+ }
+
+ private static Object index(Request request, Response response) {
+ return """
+ <!doctype html>
+ <html>
+ <head><title>Index</title></head>
+ <body>
+ <h1>Index</h1>
+
+ <a href="1">Next</a>
+
+ <p>
+ Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless + sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their + bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment. + + Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles. + Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an + evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest. + He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a + golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the + two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who + live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free + my darling child; show reverence for Zeus’s son, far-striking Apollo.’ + """; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index 40dbaa0d..00089eb4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -11,6 +11,17 @@ import java.util.regex.Pattern; public class UrlBlocklist { private final List> patterns = new ArrayList<>(); + private record UrlPatternContains(String contains, Pattern pattern) implements Predicate { + public boolean test(String s) { + return s.contains(contains) && pattern.matcher(s).find(); + } + } + private record UrlPatternMinLength(int minLength, Pattern pattern) implements Predicate { + public boolean test(String s) { + return s.length() >= minLength && pattern.matcher(s).find(); + } + } + // domains that have a lot of links but we know we don't want to crawl private final Set badDomains = Set.of("t.co", "facebook.com", "instagram.com", "youtube.com", @@ -18,18 +29,24 @@ public class UrlBlocklist { public UrlBlocklist() { // Don't deep-crawl git repos - patterns.add(Pattern.compile("\\.git/.+").asPredicate()); - patterns.add(Pattern.compile("wp-content/upload").asPredicate()); + patterns.add(s -> s.contains(".git/")); + + patterns.add(s -> s.contains("wp-content/upload")); + patterns.add(s -> s.contains("-download-free")); // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling - patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate()); + patterns.add(new UrlPatternMinLength(48, Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)"))); // link farms &c - patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate()); - patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate()); - patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate()); - patterns.add(Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$").asPredicate()); - patterns.add(Pattern.compile(".*-download-free$").asPredicate()); + patterns.add(new UrlPatternContains("/download", 
Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"))); + patterns.add(new UrlPatternContains("/permalink", Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$"))); + patterns.add(new UrlPatternContains("/webrx", Pattern.compile("webrx3.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"))); + patterns.add(new UrlPatternContains("/lib", Pattern.compile("lib.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"))); + patterns.add(new UrlPatternContains("/pdf", Pattern.compile("pdf.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"))); + patterns.add(new UrlPatternContains("/book", Pattern.compile("book.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"))); + patterns.add(new UrlPatternContains("/720p", Pattern.compile("720p.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"))); + patterns.add(new UrlPatternContains("/node", Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$"))); + } public boolean isUrlBlocked(EdgeUrl url) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java index d55cd2bb..57298c84 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java @@ -1,7 +1,11 @@ package nu.marginalia.wmsa.edge.crawling.model; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; + import java.util.List; +@AllArgsConstructor @NoArgsConstructor public class CrawlingSpecification { public String id; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 14716fbf..5e60ec3a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -28,9 +28,11 @@ import static java.lang.Math.max; import static java.lang.Math.min; public class CrawlerRetreiver { - private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 250); + private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500); private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); + private static final int MAX_ERRORS = 10; + private final LinkedList queue = new LinkedList<>(); private final HttpFetcher fetcher; @@ -50,6 +52,8 @@ public class CrawlerRetreiver { private static final IpBlockList ipBlocklist; private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); + int errorCount = 0; + static { try { ipBlocklist = new IpBlockList(new GeoIpBlocklist()); @@ -75,7 +79,7 @@ public class CrawlerRetreiver { if (queue.peek() != null) { var fst = queue.peek(); - var root = fst.domain.toRootUrl(); + var root = fst.withPathAndParam("/", null); if (known.add(root.toString())) queue.addFirst(root); } @@ -117,7 +121,7 @@ public class CrawlerRetreiver { .build()); } - var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl()); + var fetchResult = fetcher.probeDomain(fst.withPathAndParam("/", null)); if (!fetchResult.ok()) { logger.debug("Bad status on {}", domain); return 
Optional.of(createErrorPostFromStatus(fetchResult)); @@ -137,7 +141,7 @@ public class CrawlerRetreiver { int fetchedCount = 0; - while (!queue.isEmpty() && visited.size() < depth) { + while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS ) { var top = queue.removeFirst(); if (!robotsRules.isAllowed(top.toString())) { @@ -179,6 +183,10 @@ public class CrawlerRetreiver { EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add); } + if ("ERROR".equals(d.crawlerStatus)) { + errorCount++; + } + } long crawledTime = System.currentTimeMillis() - startTime; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 3b7239c0..d215d66e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -111,7 +111,7 @@ public class HttpFetcher { @SneakyThrows public FetchResult probeDomain(EdgeUrl url) { var head = new Request.Builder().head().addHeader("User-agent", userAgent) - .url(url.domain.toRootUrl().toString()) + .url(url.toString()) .build(); var call = client.newCall(head); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java index e3db4aae..a66101dc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java @@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; import nu.marginalia.wmsa.edge.search.model.BrowseResult; import java.util.List; +import java.util.Optional; @ImplementedBy(EdgeDataStoreDaoImpl.class) public interface EdgeDataStoreDao { @@ -23,7 +24,7 @@ public interface EdgeDataStoreDao { List getUrlDetailsMulti(EdgeIdCollection ids); - EdgeDomain getDomain(EdgeId id); + Optional getDomain(EdgeId id); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index a695073c..19752b51 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -93,7 +93,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { WORDS_TOTAL, FORMAT, FEATURES, IP, DOMAIN_STATE, DATA_HASH - FROM EC_URL_VIEW WHERE ID IN + FROM EC_URL_VIEW + WHERE TITLE IS NOT NULL + AND ID IN """ + idString)) { stmt.setFetchSize(ids.size()); @@ -113,7 +115,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { EdgePageScoreAdjustment.zero(), // urlQualityAdjustment Integer.MAX_VALUE, // rankingId Double.MAX_VALUE, // termScore - 1 // resultsFromSameDomain + 1, // resultsFromSameDomain + "", // positions + null // result item ); if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF && Strings.isNullOrEmpty(val.description) @@ -309,18 +313,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @Override @SneakyThrows - public EdgeDomain getDomain(EdgeId id) { + public Optional getDomain(EdgeId id) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { 
stmt.setInt(1, id.id()); var rsp = stmt.executeQuery(); if (rsp.next()) { - return new EdgeDomain(rsp.getString(1)); + return Optional.of(new EdgeDomain(rsp.getString(1))); } - throw new NoSuchElementException(); + return Optional.empty(); } } } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java index 8d17d241..2e744ed8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import gnu.trove.set.hash.TIntHashSet; -import io.prometheus.client.Counter; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; import org.slf4j.Logger; @@ -18,8 +17,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist { private final HikariDataSource dataSource; private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final Counter wmsa_blacklist_intercept = Counter.build("wmsa_blacklist_intercept", - "wmsa_blacklist_intercept").register(); @Inject public EdgeDomainBlacklistImpl(HikariDataSource dataSource) { this.dataSource = dataSource; @@ -65,7 +62,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist { @Override public boolean isBlacklisted(int domainId) { if (spamDomainSet.contains(domainId)) { - wmsa_blacklist_intercept.inc(); return true; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerMain.java new file mode 100644 index 00000000..cf6a0d9b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerMain.java @@ -0,0 +1,34 @@ +package nu.marginalia.wmsa.edge.explorer; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.wmsa.configuration.MainClass; +import nu.marginalia.wmsa.configuration.ServiceDescriptor; +import nu.marginalia.wmsa.configuration.module.ConfigurationModule; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.configuration.server.Initialization; +import spark.Spark; + +public class ExplorerMain extends MainClass { + final ExplorerService service; + + @Inject + public ExplorerMain(ExplorerService service) { + this.service = service; + } + + public static void main(String... 
args) { + init(ServiceDescriptor.EXPLORER, args); + + Spark.staticFileLocation("/static/explore/"); + + Injector injector = Guice.createInjector( + new ConfigurationModule(), + new DatabaseModule() + ); + + injector.getInstance(ExplorerMain.class); + injector.getInstance(Initialization.class).setReady(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java new file mode 100644 index 00000000..ef5d935c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java @@ -0,0 +1,253 @@ +package nu.marginalia.wmsa.edge.explorer; + +import com.google.inject.Inject; +import com.google.inject.name.Named; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.wmsa.configuration.server.MetricsServer; +import nu.marginalia.wmsa.configuration.server.Service; +import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; +import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.wmsa.resource_store.StaticResources; +import org.jetbrains.annotations.NotNull; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.sql.SQLException; +import java.util.*; + +public class ExplorerService extends Service { + + private final MustacheRenderer renderer; + private final HikariDataSource dataSource; + private final StaticResources staticResources; + + record SearchResult( + String domain, + String url, + double relatedness, + boolean hasMore, + boolean active, + boolean indexed) implements Comparable { + + @Override + public int compareTo(@NotNull SearchResult o) { + return (int)(o.relatedness - relatedness); + } + } + + record SearchResults(String query, String message, String aliasDomain, List resultList) { } + + @SneakyThrows + @Inject + public ExplorerService(@Named("service-host") String ip, + @Named("service-port") Integer port, + Initialization initialization, + MetricsServer metricsServer, + RendererFactory rendererFactory, + HikariDataSource dataSource, + StaticResources staticResources + ) { + + super(ip, port, initialization, metricsServer); + + renderer = rendererFactory.renderer("explorer/explorer"); + this.dataSource = dataSource; + this.staticResources = staticResources; + Spark.get("/public/", this::serveIndex, this::render); + Spark.get("/public/search", this::search, this::render); + Spark.get("/public/:resource", this::serveStatic); + + } + + + private Object serveStatic(Request request, Response response) { + String resource = request.params("resource"); + staticResources.serveStatic("explore", resource, request, response); + return ""; + } + + public String render(Object results) { + return renderer.render(results); + } + + private SearchResults serveIndex(Request request, Response response) { + + return new SearchResults("", "", null, Collections.emptyList()); + } + + + private SearchResults search(Request request, Response response) throws SQLException { + String query = request.queryParams("domain"); + + query = trimUrlJunk(query); + + DomainIdInformation domainId = getDomainId(query); + if (!domainId.isPresent()) { + return new SearchResults(query, + "Could not find such a domain (maybe try adding/removing www?)", + null, Collections.emptyList()); + } + + var relatedDomains = getRelatedDomains(domainId); + + if (relatedDomains.isEmpty()) { + String message = """ + I've 
got nothing. This may either be due to the website being far out in the periphery of Marginalia's + search engine index, or it may be due to the website being too big. + A few hundred of the biggest websites are excluded for performance reasons. They are usually + not very interesting to look at either as everyone links to them and there's no real pattern to discern. + """; + + return new SearchResults(query, message, domainId.alias, relatedDomains); + } + + return new SearchResults(query, "", domainId.alias, relatedDomains); + } + + private List getRelatedDomains(DomainIdInformation domainIdInformation) throws SQLException { + List ret = new ArrayList<>(); + Set seen = new HashSet<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT + NV.NEIGHBOR_NAME, + NV.RELATEDNESS, + (LV.DOMAIN_ID IS NOT NULL), + (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'), + INDEXED > 0 + FROM EC_NEIGHBORS_VIEW NV + LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.NEIGHBOR_ID=LV.DOMAIN_ID) + INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID + WHERE NV.DOMAIN_ID=? + GROUP BY NV.NEIGHBOR_ID + ORDER BY NV.RELATEDNESS DESC + """); + var stmtRev = conn.prepareStatement(""" + SELECT + NV.DOMAIN_NAME, + NV.RELATEDNESS, + (LV.NEIGHBOR_ID IS NOT NULL), + (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'), + INDEXED > 0 + FROM EC_NEIGHBORS_VIEW NV + LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.DOMAIN_ID=LV.NEIGHBOR_ID) + INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.DOMAIN_ID + WHERE NV.NEIGHBOR_ID=? + GROUP BY NV.DOMAIN_ID + ORDER BY NV.RELATEDNESS DESC + """ + ); + + ) { + + stmt.setInt(1, domainIdInformation.domainId); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + + String domainName = rsp.getString(1); + double relatedness = rsp.getDouble(2); + boolean hasMore = rsp.getBoolean(3); + boolean active = rsp.getBoolean(4); + boolean indexed = rsp.getBoolean(5); + + seen.add(domainName); + + String url = "http://" + domainName + "/"; + + + if (domainName.length() < 48 && domainName.contains(".")) { + ret.add(new SearchResult( + domainName, + url, + relatedness, + hasMore, + active, + indexed + )); + } + } + + stmtRev.setInt(1, domainIdInformation.domainId); + rsp = stmtRev.executeQuery(); + while (rsp.next()) { + + String domainName = rsp.getString(1); + double relatedness = rsp.getDouble(2); + boolean hasMore = rsp.getBoolean(3); + boolean active = rsp.getBoolean(4); + boolean indexed = rsp.getBoolean(5); + + String url = "http://" + domainName + "/"; + + if (!seen.add(domainName)) + continue; + + if (domainName.length() < 48 && domainName.contains(".")) { + ret.add(new SearchResult( + domainName, + url, + relatedness, + hasMore, + active, + indexed + )); + } + } + } + + Comparator comp = SearchResult::compareTo; + comp = comp.thenComparing(SearchResult::domain); + ret.sort(comp); + + return ret; + + } + + private DomainIdInformation getDomainId(String query) throws SQLException { + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME + FROM EC_DOMAIN DOMAIN + LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID + WHERE DOMAIN.DOMAIN_NAME=? 
+ """)) { + stmt.setString(1, query); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return new DomainIdInformation( + rsp.getInt(1), + rsp.getBoolean(2), + rsp.getString(3) + ); + } + } + return new DomainIdInformation(-1, false, null); + } + + private String trimUrlJunk(String query) { + if (query.startsWith("http://")) { + query = query.substring(7); + } + if (query.startsWith("https://")) { + query = query.substring(8); + } + + int lastSlash = query.indexOf('/'); + if (lastSlash > 0) { + query = query.substring(0, lastSlash); + } + + return query; + } + + record DomainIdInformation(int domainId, boolean indexed, String alias) { + boolean isPresent() { + return domainId >= 0; + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java index 89d3dca6..3633f307 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java @@ -1,20 +1,19 @@ package nu.marginalia.wmsa.edge.index; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter; -import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams; import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator; import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryRankLimitingFilter; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Collections; -import java.util.Comparator; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -103,54 +102,65 @@ public class EdgeIndexBucket { return indexReader != null; } - public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) { + public IndexQuery getQuery(LongPredicate filter, IndexQueryParams params) { + if (null == indexReader) { - logger.warn("Index reader not neady {}", block); + logger.warn("Index reader not neady {}", params.block()); return new IndexQuery(Collections.emptyList()); } - final int[] orderedIncludes = searchTerms.includes - .stream() - .sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i))) - .distinct() - .mapToInt(Integer::intValue) - .toArray(); + final int[] orderedIncludes = params.searchTerms() + .sortedDistinctIncludes((a, b) -> compareKeywords(params.block(), a, b)); - IndexQueryFactory.IndexQueryBuilder query; + IndexQueryFactory.IndexQueryBuilder query = createQueryBuilder(orderedIncludes[0], params); - query = indexReader.findWord(cachePool, block, orderedIncludes[0]); if (query == null) { return new IndexQuery(Collections.emptyList()); } - query.filter(filter); + query.addInclusionFilter(new QueryFilterStepFromPredicate(filter)); + if (params.rankLimit() != null) { + query.addInclusionFilter(new QueryRankLimitingFilter(params.rankLimit())); + } for (int i = 1; i < 
orderedIncludes.length; i++) { query = query.also(orderedIncludes[i]); } - for (int term : searchTerms.excludes) { + for (int term : params.searchTerms().excludes()) { query = query.not(term); } return query.build(); } + private IndexQueryFactory.IndexQueryBuilder createQueryBuilder(int firstKeyword, IndexQueryParams params) { - public IndexQuery getDomainQuery(IndexQueryCachePool pool, int wordId, ResultDomainDeduplicator localFilter) { - var query = indexReader.findDomain(pool, wordId); + if (params.targetDomains() != null && !params.targetDomains().isEmpty()) { + return indexReader.findWordForDomainList(params.block(), params.targetDomains(), firstKeyword); + } + return indexReader.findWord(params.block(), params.qualityLimit(), firstKeyword); + + } + + private int compareKeywords(IndexBlock block, int a, int b) { + return Long.compare( + indexReader.numHits(block, a), + indexReader.numHits(block, b) + ); + } + + + public IndexQuery getDomainQuery(int wordId, ResultDomainDeduplicator localFilter) { + var query = indexReader.findDomain(wordId); query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue)); return query; } - public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) { - return indexReader.getBlockForResult(cachePool, termId, urlId); + /** Replaces the values of ids with their associated metadata, or 0L if absent */ + public long[] getMetadata(IndexBlock block, int termId, long[] ids) { + return indexReader.getMetadata(block, termId, ids); } - - public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) { - return indexReader.isTermInBucket(cachePool, block, termId, urlId); - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index 8df32c0a..577553b6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; import java.io.IOException; @@ -18,9 +18,6 @@ public class EdgeIndexControl { } public void regenerateIndex(int id) { - System.runFinalization(); - System.gc(); - for (IndexBlock block : IndexBlock.values()) { try { servicesFactory.convertIndex(id, block); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 9774fd97..3249241d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -9,6 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexDomainQueryService; import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService; import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService; import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService; @@ -39,7 +40,9 @@ 
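
The term ordering in getQuery above (sortedDistinctIncludes with compareKeywords) keeps the behavior of the deleted stream pipeline: include terms are deduplicated and sorted by ascending hit count, so the rarest, most selective term seeds the query and the broader terms only filter an already-small candidate set. The same ordering over a plain int[] (hitCount standing in for indexReader.numHits):

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.function.IntToLongFunction;

    class TermOrdering {
        // Rarest-first: evaluate the most selective term before the others.
        static int[] sortedDistinctIncludes(int[] termIds, IntToLongFunction hitCount) {
            return Arrays.stream(termIds)
                    .distinct()
                    .boxed()
                    .sorted(Comparator.comparingLong(hitCount::applyAsLong))
                    .mapToInt(Integer::intValue)
                    .toArray();
        }
    }
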
public class EdgeIndexService extends Service { EdgeIndexOpsService opsService, EdgeIndexLexiconService lexiconService, - EdgeIndexQueryService indexQueryService) + EdgeIndexQueryService indexQueryService, + EdgeIndexDomainQueryService domainQueryService + ) { super(ip, port, init, metricsServer); @@ -51,7 +54,7 @@ public class EdgeIndexService extends Service { Spark.post("/words/", lexiconService::putWords); Spark.post("/search/", indexQueryService::search, gson::toJson); - Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson); + Spark.post("/search-domain/", domainQueryService::searchDomain, gson::toJson); Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index 869c6f5b..0b516ba4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -103,9 +103,9 @@ public class IndexServicesFactory { public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException { var converter = new SearchIndexConverter(block, id, tmpFileDir, - preconverterOutputFile.get(id, block.ordinal()), - indexWriteWordsFile.get(id, block.id), - indexWriteUrlsFile.get(id, block.id), + preconverterOutputFile.get(id, block), + indexWriteWordsFile.get(id, block), + indexWriteUrlsFile.get(id, block), partitioner, domainBlacklist ); @@ -118,7 +118,7 @@ public class IndexServicesFactory { for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) { for (IndexBlock block : IndexBlock.values()) { - shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block.ordinal())); + shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block)); } } @@ -129,7 +129,7 @@ public class IndexServicesFactory { ); } - private File getPreconverterOutputFile(int index, int block) { + private File getPreconverterOutputFile(int index, IndexBlock block) { return preconverterOutputFile.get(index, block); } @@ -141,7 +141,7 @@ public class IndexServicesFactory { indexMap.put(block, createSearchIndex(id, block)); } catch (Exception ex) { - logger.error("Could not create index {}-{}", id, block); + logger.error("Could not create index {}-{} ({})", id, block, ex.getMessage()); } } return new SearchIndexReader(indexMap); @@ -150,8 +150,8 @@ public class IndexServicesFactory { private SearchIndex createSearchIndex(int bucketId, IndexBlock block) { try { return new SearchIndex("IndexReader"+bucketId+":"+ block.name(), - indexReadUrlsFile.get(bucketId, block.id), - indexReadWordsFile.get(bucketId, block.id)); + indexReadUrlsFile.get(bucketId, block), + indexReadWordsFile.get(bucketId, block)); } catch (IOException e) { throw new RuntimeException(e); } @@ -159,9 +159,10 @@ public class IndexServicesFactory { public Callable switchFilesJob(int id) { return () -> { - for (int block = 0; block < IndexBlock.values().length; block++) { + + for (var block : IndexBlock.values()) { if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) && - Files.exists(indexWriteUrlsFile.get(id, block).toPath())) { + Files.exists(indexWriteUrlsFile.get(id, block).toPath())) { Files.move( indexWriteWordsFile.get(id, block).toPath(), indexReadWordsFile.get(id, block).toPath(), @@ -172,6 +173,7 
@@ public class IndexServicesFactory { StandardCopyOption.REPLACE_EXISTING); } } + return true; }; } @@ -205,8 +207,8 @@ class PartitionedDataFile { this.pattern = pattern; } - public File get(int id) { - Path partitionDir = partition.resolve(Integer.toString(id)); + public File get(Object id) { + Path partitionDir = partition.resolve(id.toString()); if (!partitionDir.toFile().exists()) { partitionDir.toFile().mkdir(); } @@ -223,13 +225,13 @@ class DoublePartitionedDataFile { this.pattern = pattern; } - public File get(int id, int id2) { - Path partitionDir = partition.resolve(Integer.toString(id)); + public File get(Object id, Object id2) { + Path partitionDir = partition.resolve(id.toString()); if (!partitionDir.toFile().exists()) { partitionDir.toFile().mkdir(); } - partitionDir = partitionDir.resolve(Integer.toString(id2)); + partitionDir = partitionDir.resolve(id2.toString()); if (!partitionDir.toFile().exists()) { partitionDir.toFile().mkdir(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java index 7b2ed5c6..157e3fb8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java @@ -47,6 +47,9 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder(); wordSetBuilder.setIndex(wordSet.block().ordinal()); wordSetBuilder.addAllWords(List.of(wordSet.keywords())); + for (var meta : wordSet.metadata()) { + wordSetBuilder.addMeta(meta); + } keywordBuilder.addWordSet(wordSetBuilder.build()); var req = keywordBuilder.build(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java index 61469083..43ec70f7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java @@ -21,7 +21,6 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; import java.util.Arrays; -import java.util.List; @Singleton public class EdgeIndexLocalService implements EdgeIndexWriterClient { @@ -53,9 +52,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient { return; } - for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) { + for (var chunk : ListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) { - var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); + var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block()); indexWriter.put(header, entry); @@ -63,19 +62,22 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient { } - private long[] getOrInsertWordIds(List words) { - long[] ids = new long[words.size()]; - int putId = 0; + private long[] getOrInsertWordIds(String[] words, long[] meta) { + long[] ids = new long[words.length*2]; + int putIdx = 0; + + for (int i = 0; i < words.length; i++) { + String word = words[i]; - for (String word : words) { long id = lexicon.getOrInsert(word); if (id != DictionaryHashMap.NO_VALUE) { - ids[putId++] 
= id; + ids[putIdx++] = id; + ids[putIdx++] = meta[i]; } } - if (putId != words.size()) { - ids = Arrays.copyOf(ids, putId); + if (putIdx != words.length*2) { + ids = Arrays.copyOf(ids, putIdx); } return ids; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index 79c47a08..d78ef51a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -20,12 +20,14 @@ import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; -import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH; - public class SearchIndexConverter { - public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8); + public static final int ENTRY_URL_OFFSET = 0; + public static final int ENTRY_METADATA_OFFSET = 1; + public static final int ENTRY_SIZE = 2; - private final long[] tmpWordsBuffer = new long[MAX_LENGTH]; + public static final BTreeContext urlsBTreeContext = new BTreeContext(5, ENTRY_SIZE, ~0, 8); + + private final long[] tmpWordsBuffer = SearchIndexJournalReader.createAdequateTempBuffer(); private final Path tmpFileDir; @@ -72,7 +74,7 @@ public class SearchIndexConverter { return; } - logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader); + logger.info("Converting {} ({}) {} {}", block.ordinal(), block, inputFile, journalReader.fileHeader); var lock = partitioner.getReadLock(); try { @@ -80,10 +82,10 @@ public class SearchIndexConverter { var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); + logger.info("Creating word index table {} for block {}", outputFileWords, block.ordinal()); WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords); - logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); + logger.info("Creating word urls table {} for block {}", outputFileUrls, block.ordinal()); createUrlTable(journalReader, tmpUrlsFile, wordIndexTable); Files.delete(tmpUrlsFile); @@ -111,10 +113,10 @@ public class SearchIndexConverter { final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); - for (int i = 0; i < entryData.size(); i++) { - int wordId = (int) entryData.get(i); + for (var record : entryData) { + int wordId = record.wordId(); if (wordId < 0 || wordId >= topWord) { - logger.warn("Bad wordId {}", wordId); + logger.warn("Bad word {}", record); } wordsTableWriter.acceptWord(wordId); } @@ -138,7 +140,7 @@ public class SearchIndexConverter { try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) { - try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) { + try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, ENTRY_SIZE * numberOfWordsTotal, 10_000_000)) { int[] wordWriteOffset = new int[wordOffsetsTable.length()]; for (var entry : journalReader) { @@ -146,21 +148,29 @@ public class SearchIndexConverter { var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); - for (int i = 0; i < entryData.size(); i++) { - int wordId 
= (int) entryData.get(i); + for (var record : entryData) { + int wordId = record.wordId(); + long metadata = record.metadata(); - if (wordId >= wordWriteOffset.length) + if (wordId >= wordWriteOffset.length) { + logger.warn("Overflowing wordId {}", wordId); continue; + } + if (wordId < 0) { logger.warn("Negative wordId {}", wordId); } final long urlInternal = translateUrl(entry.docId()); - if (wordId > 0) { - rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal); - } else { - rwf.put(wordWriteOffset[wordId]++, urlInternal); - } + + long offset; + if (wordId > 0) offset = wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]; + else offset = wordWriteOffset[wordId]; + + rwf.put(offset + ENTRY_URL_OFFSET, urlInternal); + rwf.put(offset + ENTRY_METADATA_OFFSET, metadata); + + wordWriteOffset[wordId] += ENTRY_SIZE; } } @@ -171,9 +181,9 @@ public class SearchIndexConverter { try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) { if (wordOffsetsTable.length() > 0) { - var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); + var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit, ENTRY_SIZE); - wordOffsetsTable.forEachRange(urlTmpFileSorter::sort); + wordOffsetsTable.forEachRange(urlTmpFileSorter::sortRange); urlsTmpFileMap.force(); } else { @@ -187,7 +197,7 @@ public class SearchIndexConverter { wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> { // Note: The return value is accumulated into accumulatorIdx! - return writer.write(accumulatorIdx, length, + return writer.write(accumulatorIdx, length/ENTRY_SIZE, slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); }); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java index fcf6d175..ce2c30d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java @@ -9,7 +9,6 @@ import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterStandardPageRank; -import nu.marginalia.util.ranking.BuggyStandardPageRank; import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.edge.index.model.RankingSettings; import org.slf4j.Logger; @@ -87,8 +86,25 @@ public class SearchIndexDao { @SneakyThrows public TIntList getStandardDomains() { - var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2); + TIntArrayList results = new TIntArrayList(); + + try (var connection = dataSource.getConnection(); + var stmt = connection.prepareStatement( + """ + SELECT ID FROM EC_DOMAIN + WHERE INDEXED>0 + AND STATE='ACTIVE' + AND DOMAIN_ALIAS IS NULL + ORDER BY ID ASC + """); + ) { + var rs = stmt.executeQuery(); + while (rs.next()) { + results.add(rs.getInt(1)); + } + } + return results; + } @SneakyThrows diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java index 2f2e9d47..8d7f19eb 100644 --- 
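The createUrlTable logic above now emits two longs per posting instead of one: the translated url word followed by its metadata word, with all write offsets advancing by ENTRY_SIZE. A minimal standalone sketch of that layout, not part of the patch and with all names hypothetical:

    // Sketch only: demonstrates the [url, metadata] pair layout the
    // converter now writes. ENTRY_* constants mirror SearchIndexConverter.
    class EntryLayoutSketch {
        static final int ENTRY_URL_OFFSET = 0;
        static final int ENTRY_METADATA_OFFSET = 1;
        static final int ENTRY_SIZE = 2;

        public static void main(String[] args) {
            long[] postings = new long[2 * ENTRY_SIZE];

            // first posting: the url goes first, its metadata word second
            postings[0 + ENTRY_URL_OFFSET] = 1234L;
            postings[0 + ENTRY_METADATA_OFFSET] = 0xFF00L;

            // the next posting begins ENTRY_SIZE longs later, which is why
            // sizes and offsets in the converter are scaled by ENTRY_SIZE
            postings[ENTRY_SIZE + ENTRY_URL_OFFSET] = 5678L;
            postings[ENTRY_SIZE + ENTRY_METADATA_OFFSET] = 0x00FFL;

            for (int i = 0; i < postings.length; i += ENTRY_SIZE) {
                System.out.println(postings[i + ENTRY_URL_OFFSET] + " -> "
                        + Long.toHexString(postings[i + ENTRY_METADATA_OFFSET]));
            }
        }
    }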
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java @@ -110,11 +110,12 @@ public class SearchIndexPartitioner { return true; if (academiaRanking.hasBucket(bucketId, domainId)) return true; - if (standardRanking.hasBucket(bucketId, domainId)) - return true; if (specialDomainRanking.hasBucket(bucketId, domainId)) return true; + if (standardRanking.hasBucket(bucketId, domainId)) + return true; + return DYNAMIC_BUCKET_LENGTH == bucketId; } @@ -148,15 +149,15 @@ public class SearchIndexPartitioner { if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) { return academiaRanking.translateId(id); } - if (standardRanking != null && standardRanking.ownsBucket(bucketId)) { - return standardRanking.translateId(id); - } if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) { return specialDomainRanking.translateId(id); } - if (retroRanking != null) { - return retroRanking.translateId(id); + + // standard gets passed straight through + if (standardRanking != null && standardRanking.ownsBucket(bucketId)) { + return id; } + return id; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java index 57d63825..4b16a817 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java @@ -52,7 +52,7 @@ public class SearchIndexPreconverter { var lock = partitioner.getReadLock(); try { lock.lock(); - ByteBuffer buffer = ByteBuffer.allocateDirect(8192); + ByteBuffer buffer = ByteBuffer.allocateDirect(65536); for (var entry : indexJournalReader) { if (!partitioner.isGoodUrl(entry.urlId()) || spamDomains.contains(entry.domainId())) { @@ -93,7 +93,7 @@ public class SearchIndexPreconverter { } public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) { - return shard.block == entry.header.block().id + return shard.block == entry.header.block().ordinal() && partitioner.filterUnsafe(entry.domainId(), shard.bucket); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java index f1308d6e..7a601a4f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java @@ -23,10 +23,10 @@ public class WordIndexOffsetsTable { for (int i = 1; i < table.length; i++) { long start = table[i-1]; - int length = (int) (table[i] - start); + long end = table[i]; - if (length != 0) { - o.accept(start, length); + if (start != end) { + o.accept(start, end); } } } @@ -58,7 +58,7 @@ public class WordIndexOffsetsTable { } public interface OffsetTableEntryConsumer { - void accept(long start, int length) throws IOException; + void accept(long start, long end) throws IOException; } public interface OffsetTableEntryFoldConsumer { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java
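With the change above, forEachRange hands its consumer a half-open [start, end) interval instead of a (start, length) pair, which also removes the old int cast on the length. A small sketch of the iteration contract, under the assumption that the table holds cumulative offsets (all names here are hypothetical):

    // Sketch only: half-open ranges [table[i-1], table[i]); empty ranges
    // are skipped, mirroring WordIndexOffsetsTable.forEachRange.
    class OffsetRangeSketch {
        interface RangeConsumer { void accept(long start, long end); }

        static void forEachRange(long[] table, RangeConsumer o) {
            for (int i = 1; i < table.length; i++) {
                long start = table[i - 1];
                long end = table[i];
                if (start != end) {
                    o.accept(start, end); // a length, where needed, is end - start
                }
            }
        }

        public static void main(String[] args) {
            // prints 0..4 and 4..10; the empty 4..4 range is skipped
            forEachRange(new long[] {0, 4, 4, 10},
                    (s, e) -> System.out.println(s + ".." + e));
        }
    }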
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java index 15ad0cd3..d3e54d19 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java @@ -8,8 +8,10 @@ import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.File; +import java.io.IOException; +import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE; import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext; public class WordsTableWriter { @@ -23,7 +25,9 @@ public class WordsTableWriter { } public void acceptWord(int wordId) { - table.lengths().increment(wordId); + for (int i = 0; i < ENTRY_SIZE; i++) { + table.lengths().increment(wordId); + } } public WordIndexOffsetsTable getTable() { @@ -58,7 +62,7 @@ public class WordsTableWriter { mapSlice.put(idx++, (long)length<<32); mapSlice.put(idx++, 0); - urlFileOffset += (urlsBTreeContext.calculateSize(length)); + urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE)); } for (int i = 1; i < offsetTable.length; i++) { @@ -68,7 +72,7 @@ public class WordsTableWriter { mapSlice.put(idx++, (long)length << 32 | i); mapSlice.put(idx++, urlFileOffset); - urlFileOffset += (urlsBTreeContext.calculateSize(length)); + urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE)); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java index 94ebeacf..65448755 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java @@ -12,6 +12,8 @@ import org.jetbrains.annotations.NotNull; import java.nio.ByteBuffer; import java.util.Iterator; +import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.ENTRY_SIZE; +import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH; import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> { @@ -23,6 +25,10 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> { private final int size; private final long[] underlyingArray; public static final int MAX_LENGTH = 1000; + public static final int ENTRY_SIZE = 2; public SearchIndexJournalEntry(long[] underlyingArray) { this.size = underlyingArray.length; @@ -46,4 +48,24 @@ public class SearchIndexJournalEntry { return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray())); } + public Iterator<Record> iterator() { + return new EntryIterator(); + } + + private class EntryIterator implements Iterator<Record> { + int pos = -ENTRY_SIZE; + + public boolean hasNext() { + return pos + ENTRY_SIZE < size; + } + + @Override + public Record next() { + pos+=ENTRY_SIZE; + + return new Record((int) underlyingArray[pos], underlyingArray[pos+1]); + } + } + + public record Record(int wordId, long metadata) {} } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
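The new Record iterator above walks the entry's backing array two longs at a time: an even slot holds a word id, the following odd slot holds that word's metadata. A self-contained sketch of the same pattern, using hypothetical stand-in names rather than the patch's classes:

    // Sketch only: iterate a long[] as (wordId, metadata) pairs, the way
    // SearchIndexJournalEntry's EntryIterator does.
    import java.util.Iterator;

    class JournalEntrySketch implements Iterable<JournalEntrySketch.Record> {
        record Record(int wordId, long metadata) {}
        static final int ENTRY_SIZE = 2;
        private final long[] data;
        JournalEntrySketch(long[] data) { this.data = data; }

        public Iterator<Record> iterator() {
            return new Iterator<>() {
                int pos = -ENTRY_SIZE;
                public boolean hasNext() { return pos + ENTRY_SIZE < data.length; }
                public Record next() {
                    pos += ENTRY_SIZE;
                    return new Record((int) data[pos], data[pos + 1]);
                }
            };
        }

        public static void main(String[] args) {
            // prints Record[wordId=7, metadata=160], Record[wordId=9, metadata=176]
            for (var rec : new JournalEntrySketch(new long[] {7, 0xA0, 9, 0xB0}))
                System.out.println(rec);
        }
    }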
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java index f7b95ec4..27514b58 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -5,6 +5,7 @@ import com.google.common.hash.Hashing; import io.prometheus.client.Gauge; import lombok.SneakyThrows; import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,7 +17,7 @@ import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; public class KeywordLexicon implements AutoCloseable { - private final DictionaryHashMap reverseIndex; + private final DictionaryMap reverseIndex; private final ReadWriteLock memoryLock = new ReentrantReadWriteLock(); private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -30,7 +31,7 @@ public class KeywordLexicon implements AutoCloseable { private final KeywordLexiconJournal journal; @SneakyThrows - public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryHashMap reverseIndexHashMap) { + public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryMap reverseIndexHashMap) { journal = keywordLexiconJournal; reverseIndex = reverseIndexHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java deleted file mode 100644 index 9c78a2d2..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java +++ /dev/null @@ -1,16 +0,0 @@ -package nu.marginalia.wmsa.edge.index.model; - -import lombok.AllArgsConstructor; - -import java.util.ArrayList; -import java.util.List; - -@AllArgsConstructor -public class EdgeIndexSearchTerms { - public List<Integer> includes = new ArrayList<>(); - public List<Integer> excludes = new ArrayList<>(); - - public boolean isEmpty() { - return includes.isEmpty(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java new file mode 100644 index 00000000..848cc870 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java @@ -0,0 +1,32 @@ +package nu.marginalia.wmsa.edge.index.model; + +import java.util.EnumSet; + +public enum EdgePageWordFlags { + Title, + Subjects, + NamesWords, + Site, + SiteAdjacent, + Simple; + + public int asBit() { + return 1 << ordinal(); + } + + public boolean isPresent(long value) { + return (asBit() & value) > 0; + } + + public static EnumSet<EdgePageWordFlags> decode(long encodedValue) { + EnumSet<EdgePageWordFlags> ret = EnumSet.noneOf(EdgePageWordFlags.class); + + for (EdgePageWordFlags f : values()) { + if ((encodedValue & f.asBit()) > 0) { + ret.add(f); + } + } + + return ret; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java new file mode 100644 index 00000000..84a907f0 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java @@ -0,0 +1,90 @@ +package nu.marginalia.wmsa.edge.index.model; + +import
nu.marginalia.util.BrailleBlockPunchCards; + +import java.util.EnumSet; + +import static java.lang.Math.max; +import static java.lang.Math.min; + +public record EdgePageWordMetadata(int tfIdf, + int positions, + int quality, + int count, + EnumSet<EdgePageWordFlags> flags) { + + // If flags are moved from the least significant end of + // this struct, then EntrySourceFromBTree will break. + + public static final long COUNT_MASK = 0xFL; + public static final int COUNT_SHIFT = 8; + + public static final long QUALITY_MASK = 0xFL; + public static final int QUALITY_SHIFT = 12; + + public static final long TF_IDF_MASK = 0xFFFFL; + public static final int TF_IDF_SHIFT = 16; + + public static final int POSITIONS_SHIFT = 32; + + public EdgePageWordMetadata(long value) { + this( + (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK), + (int)(value >>> POSITIONS_SHIFT), + (int)((value >>> QUALITY_SHIFT) & QUALITY_MASK), + (int)((value >>> COUNT_SHIFT) & COUNT_MASK), + EdgePageWordFlags.decode(value) + ); + } + + public static int decodeQuality(long encoded) { + return (int)((encoded >>> QUALITY_SHIFT) & QUALITY_MASK); + } + + public static boolean hasFlags(long encoded, long metadataBitMask) { + return (encoded & metadataBitMask) == metadataBitMask; + } + + public String toString() { + StringBuilder sb = new StringBuilder(getClass().getSimpleName()); + sb.append('[') + .append("tfidf=").append(tfIdf).append(", ") + .append("quality=").append(quality).append(", ") + .append("count=").append(count).append(", ") + .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']'); + sb.append(", flags=").append(flags).append(']'); + return sb.toString(); + } + + /* Encoded in a 64 bit long as + 0-8 flags + 8-12 count, + 12-16 quality, + 16-32 tf-idf [0, 65536] + 32-64 position mask + */ + public long encode() { + long ret = 0; + + for (var flag : flags) { + ret |= flag.asBit(); + } + + ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT; + ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT; + ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT; + ret |= ((long)(positions)) << POSITIONS_SHIFT; + + return ret; + } + + public boolean isEmpty() { + return count == 0 && positions == 0 && flags.isEmpty() && tfIdf == 0; + } + + public static long emptyValue() { + return 0L; + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java deleted file mode 100644 index c63c5b1f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java +++ /dev/null @@ -1,20 +0,0 @@ -package nu.marginalia.wmsa.edge.index.model; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.ToString; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.id.EdgeId; - -@AllArgsConstructor @Getter -@ToString -public class EdgePutWordsRequest { - public EdgeId<EdgeDomain> domainId; - public EdgeId<EdgeUrl> urlId; - public double quality; - - public EdgePageWordSet wordSet; - private int index = 0; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index 67b5df80..108b4be2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++
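The EdgePageWordMetadata record above packs five fields into a single long: flags in bits 0-8, count in 8-12, quality in 12-16, tf-idf in 16-32 and the position mask in 32-64. A round-trip sketch of that packing, using the same shifts and masks; the enum here is a hypothetical stand-in for EdgePageWordFlags:

    // Sketch only: encode and decode the 64-bit word metadata layout.
    import java.util.EnumSet;

    class MetadataPackingSketch {
        enum Flag { Title, Subjects, NamesWords, Site, SiteAdjacent, Simple }

        public static void main(String[] args) {
            int tfIdf = 100, quality = 5, count = 3, positions = 0b1011;
            long v = 0;
            for (Flag f : EnumSet.of(Flag.Title, Flag.Site))
                v |= 1L << f.ordinal();          // flags live in the low bits
            v |= (long) count     << 8;
            v |= (long) quality   << 12;
            v |= (long) tfIdf     << 16;
            v |= (long) positions << 32;

            // decoding applies the same shifts and masks in reverse
            System.out.println("count="     + ((v >>> 8)  & 0xF));
            System.out.println("quality="   + ((v >>> 12) & 0xF));
            System.out.println("tfIdf="     + ((v >>> 16) & 0xFFFF));
            System.out.println("positions=" + (v >>> 32));
        }
    }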
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -1,47 +1,35 @@ package nu.marginalia.wmsa.edge.index.model; public enum IndexBlock { - TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0), - Title(IndexBlockType.QUALITY_SIGNAL, 1, 1), + Title(IndexBlockType.PAGE_DATA), + Meta(IndexBlockType.PAGE_DATA), - Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15), + Words_1(IndexBlockType.PAGE_DATA), + Words_2(IndexBlockType.PAGE_DATA), + Words_4(IndexBlockType.PAGE_DATA), + Words_8(IndexBlockType.PAGE_DATA), + Words_16Plus(IndexBlockType.PAGE_DATA), - Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0), - NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0), + Link(IndexBlockType.QUALITY_SIGNAL), + Site(IndexBlockType.QUALITY_SIGNAL), - Artifacts(IndexBlockType.PAGE_DATA, 5, 10), - Meta(IndexBlockType.PAGE_DATA, 6, 7), + Artifacts(IndexBlockType.PAGE_DATA), - Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5), - Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2), - Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5), - - Words_1(IndexBlockType.PAGE_DATA, 10, 2.0), - Words_2(IndexBlockType.PAGE_DATA,11, 3.5), - Words_4(IndexBlockType.PAGE_DATA,12, 4.0), - Words_8(IndexBlockType.PAGE_DATA,13, 4.5), - Words_16Plus(IndexBlockType.PAGE_DATA,14, 7.0), - - Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2) + Tfidf_High(IndexBlockType.TRANSIENT), + Subjects(IndexBlockType.TRANSIENT) ; public final IndexBlockType type; - public final int id; - public final double sortOrder; - IndexBlock(IndexBlockType type, int id, double sortOrder) { + IndexBlock(IndexBlockType type) { this.type = type; - this.sortOrder = sortOrder; - this.id = id; } + // This is kind of a hot method, and Enum.values() allocates a new + // array each call. + private static final IndexBlock[] values = IndexBlock.values(); public static IndexBlock byId(int id) { - for (IndexBlock block : values()) { - if (id == block.id) { - return block; - } - } - throw new IllegalArgumentException("Bad block id"); + return values[id]; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlockType.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlockType.java index b774dadd..9ee6fc49 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlockType.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlockType.java @@ -1,7 +1,10 @@ package nu.marginalia.wmsa.edge.index.model; public enum IndexBlockType { + /** This block is only used for joins */ QUALITY_SIGNAL, - TF_IDF, - PAGE_DATA + /** This block contains page keywords */ + PAGE_DATA, + /** This block is only used for generation */ + TRANSIENT } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java index 013ebf1d..72881f68 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java @@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.index.reader; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,7 +16,6 @@ import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wo public class IndexWordsTable implements AutoCloseable {
protected final MultimapFileLong words; protected final BTreeReader reader; - protected final BTreeHeader header; protected final int HEADER_OFFSET = 1; final Logger logger = LoggerFactory.getLogger(getClass()); @@ -26,8 +24,7 @@ public class IndexWordsTable implements AutoCloseable { public IndexWordsTable(MultimapFileLong words) { this.words = words; - reader = new BTreeReader(words, wordsBTreeContext); - header = reader.getHeader(HEADER_OFFSET); + reader = new BTreeReader(words, wordsBTreeContext, HEADER_OFFSET); madvise(); } @@ -49,7 +46,7 @@ public class IndexWordsTable implements AutoCloseable { } public long positionForWord(int wordId) { - long offset = reader.findEntry(header, wordId); + long offset = reader.findEntry(wordId); if (offset < 0) { return -1L; @@ -60,7 +57,7 @@ public class IndexWordsTable implements AutoCloseable { public int wordLength(int wordId) { - long offset = reader.findEntry(header, wordId); + long offset = reader.findEntry(wordId); if (offset < 0) { return -1; } @@ -72,7 +69,7 @@ public class IndexWordsTable implements AutoCloseable { words.advice(NativeIO.Advice.Random); words.advice0(NativeIO.Advice.WillNeed); - var h = reader.getHeader(HEADER_OFFSET); + var h = reader.getHeader(); int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs()); words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); @@ -80,8 +77,8 @@ public class IndexWordsTable implements AutoCloseable { } public void forEachWordsOffset(LongConsumer offsetConsumer) { - int n = header.numEntries(); - long offset = header.dataOffsetLongs(); + int n = reader.numEntries(); + long offset = reader.getHeader().dataOffsetLongs(); for (int i = 0; i < n; i++) { try { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java index 0c5a8ab0..9e5852e4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java @@ -5,21 +5,13 @@ import com.google.inject.name.Named; import com.upserve.uppend.blobs.NativeIO; import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.util.btree.CachingBTreeReader; -import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; -import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; -import java.util.Arrays; -import java.util.stream.LongStream; public class SearchIndex implements AutoCloseable { @@ -27,8 +19,6 @@ public class SearchIndex implements AutoCloseable { private final IndexWordsTable words; public final String name; private final RandomAccessFile wordsFile; - private final BTreeReader bTreeReader; - private final CachingBTreeReader cachingBTreeReader; private final Logger logger; @@ -49,16 +39,13 @@ public class SearchIndex implements AutoCloseable { urls = MultimapFileLong.forReading(inUrls.toPath()); words = IndexWordsTable.ofFile(wordsFile); - bTreeReader = new BTreeReader(urls, SearchIndexConverter.urlsBTreeContext); - cachingBTreeReader = 
new CachingBTreeReader(urls, SearchIndexConverter.urlsBTreeContext); - - Schedulers.io().scheduleDirect(() -> madvise(urls, bTreeReader)); + Schedulers.io().scheduleDirect(() -> madvise(urls)); } - private void madvise(MultimapFileLong urls, BTreeReader reader) { + private void madvise(MultimapFileLong urls) { words.forEachWordsOffset(offset -> { - var h = reader.getHeader(offset); + var h = BTreeReader.createHeader(urls, offset); long length = h.dataOffsetLongs() - h.indexOffsetLongs(); urls.adviceRange(NativeIO.Advice.WillNeed, offset, 512); @@ -70,174 +57,16 @@ public class SearchIndex implements AutoCloseable { } - public long numUrls(IndexQueryCachePool pool, int wordId) { + public long numUrls(int wordId) { int length = words.wordLength(wordId); if (length < 0) return 0; if (length > 0) return length; - return rangeForWord(pool, wordId).numEntries(); + return rangeForWord(wordId).numEntries(); } - public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) { - IndexBTreeRange range = pool.getRange(words, wordId); - - if (range == null) { - range = new IndexBTreeRange(words.positionForWord(wordId)); - pool.cacheRange(words, wordId, range); - } - - return range; - } - - public IndexBTreeRange rangeForWord(int wordId) { - return new IndexBTreeRange(words.positionForWord(wordId)); - } - - public class IndexBTreeRange { - public final long dataOffset; - private BTreeHeader header; - public IndexBTreeRange(long dataOffset) { - this.dataOffset = dataOffset; - } - - public LongStream stream(int bufferSize) { - if (dataOffset < 0) { - return LongStream.empty(); - } - if (header == null) { - header = bTreeReader.getHeader(dataOffset); - } - - long urlOffset = header.dataOffsetLongs(); - long endOffset = header.dataOffsetLongs() + header.numEntries(); - int stepSize = Math.min(bufferSize, header.numEntries()); - - long[] buffer = new long[stepSize]; - - return LongStream - .iterate(urlOffset, i -> i< endOffset, i->i+stepSize) - .flatMap(pos -> { - int sz = (int)(Math.min(pos+stepSize, endOffset) - pos); - urls.read(buffer, sz, pos); - return Arrays.stream(buffer, 0, sz); - }); - } - - public EntrySource asEntrySource() { - return new AsEntrySource(); - } - - public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) { - return new AsExcludeQueryFilterStep(pool); - } - - - public LongStream stream() { - return stream(1024); - } - - public boolean isPresent() { - return dataOffset >= 0; - } - - public long numEntries() { - if (header != null) { - return header.numEntries(); - } - else if (dataOffset < 0) return 0L; - else { - header = bTreeReader.getHeader(dataOffset); - return header.numEntries(); - } - } - - public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) { - if (dataOffset < 0) return false; - - return cachingBTreeReader.findEntry(cache, url) >= 0; - } - - public boolean hasUrl(IndexQueryCachePool pool, long url) { - if (dataOffset < 0) - return false; - - CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this); - - return cachingBTreeReader.findEntry(cache, url) >= 0; - } - - public CachingBTreeReader.BTreeCachedIndex createIndexCache() { - if (dataOffset < 0) - return null; - - if (header == null) { - header = cachingBTreeReader.getHeader(dataOffset); - } - - return cachingBTreeReader.prepareCache(header); - } - - class AsEntrySource implements EntrySource { - long pos; - final long endOffset; - - public SearchIndex getIndex() { - return SearchIndex.this; - }; - - public AsEntrySource() { - if (dataOffset <= 
0) { - pos = -1; - endOffset = -1; - return; - } - - if (header == null) { - header = bTreeReader.getHeader(dataOffset); - } - - pos = header.dataOffsetLongs(); - endOffset = header.dataOffsetLongs() + header.numEntries(); - } - - - @Override - public int read(long[] buffer, int n) { - if (pos >= endOffset) { - return 0; - } - - int rb = Math.min(n, (int)(endOffset - pos)); - urls.read(buffer, rb, pos); - pos += rb; - return rb; - } - } - - class AsExcludeQueryFilterStep implements QueryFilterStepIf { - private final CachingBTreeReader.BTreeCachedIndex cache; - - public AsExcludeQueryFilterStep(IndexQueryCachePool pool) { - cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this); - } - - public SearchIndex getIndex() { - return SearchIndex.this; - }; - public double cost() { - return cache.getIndexedDataSize(); - } - - @Override - public boolean test(long value) { - return !hasUrl(cache, value); - } - - public String describe() { - return "Exclude["+name+"]"; - } - } - + public SearchIndexURLRange rangeForWord(int wordId) { + return new SearchIndexURLRange(urls, words.positionForWord(wordId)); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index bbb2b048..bb991898 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -5,7 +5,6 @@ import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.svc.query.IndexDomainQueryFactory; import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,31 +21,14 @@ public class SearchIndexReader implements AutoCloseable { private final IndexDomainQueryFactory domainQueryFactory; private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] { - IndexBlock.Title, - IndexBlock.Tfidf_Top, - IndexBlock.Tfidf_Middle, - IndexBlock.Tfidf_Lower, - IndexBlock.Words_1, - IndexBlock.Words_2, - IndexBlock.Words_4, - IndexBlock.Words_8, - IndexBlock.Words_16Plus, - }; - @Inject public SearchIndexReader( EnumMap<IndexBlock, SearchIndex> indices) { this.indices = indices; - var lowIndex = indices.get(IndexBlock.Tfidf_Lower); - var midIndex = indices.get(IndexBlock.Tfidf_Middle); - var topIndex = indices.get(IndexBlock.Tfidf_Top); var linkIndex = indices.get(IndexBlock.Link); var titleIndex = indices.get(IndexBlock.Title); - var siteIndex = indices.get(IndexBlock.Site); var metaIndex = indices.get(IndexBlock.Meta); - var topicIndex = indices.get(IndexBlock.Subjects); var words1 = indices.get(IndexBlock.Words_1); var words2 = indices.get(IndexBlock.Words_2); @@ -57,7 +39,7 @@ public class SearchIndexReader implements AutoCloseable { queryBuilders = new EnumMap<>(IndexBlock.class); - List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1); + List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, words1, words2, words4, words8, words16); queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices)); queryBuilders.put(IndexBlock.Words_1, new
IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices)); @@ -66,7 +48,7 @@ public class SearchIndexReader implements AutoCloseable { queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices)); queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices)); - domainQueryFactory = new IndexDomainQueryFactory(siteIndex, listOfNonNulls(topicIndex)); + domainQueryFactory = new IndexDomainQueryFactory(indices.get(IndexBlock.Words_1)); } @SafeVarargs @@ -75,17 +57,31 @@ public class SearchIndexReader implements AutoCloseable { } - public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) { + public IndexQueryFactory.IndexQueryBuilder findWord(IndexBlock block, Integer quality, int wordId) { var builder = queryBuilders.get(block); if (builder == null) return null; - return builder.buildQuery(cachePool, wordId); + if (quality == null) { + return builder.buildQuery(wordId); + } + else { + return builder.buildQuery(quality, wordId); + } } - public IndexQuery findDomain(IndexQueryCachePool cachePool, int wordId) { - return domainQueryFactory.buildQuery(cachePool, wordId); + public IndexQueryFactory.IndexQueryBuilder findWordForDomainList(IndexBlock block, List<Integer> domains, int wordId) { + var builder = queryBuilders.get(block); + + if (builder == null) + return null; + + return builder.buildQuery(domains, wordId); + } + + public IndexQuery findDomain(int wordId) { + return domainQueryFactory.buildQuery(wordId); } @Override @@ -96,7 +92,7 @@ public class SearchIndexReader implements AutoCloseable { } @SneakyThrows - public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) { + public long numHits(IndexBlock block, int word) { IndexQueryFactory builder = queryBuilders.get(block); if (builder == null) @@ -104,31 +100,18 @@ public class SearchIndexReader implements AutoCloseable { long hits = 0; for (var index : builder.getIndicies()) { - hits += index.numUrls(pool, word); + hits += index.numUrls(word); } return hits; } - public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) { - for (var block : indicesBySearchOrder) { - var index = indices.get(block); - - if (null == index) { - continue; - } - - if (cachePool.isUrlPresent(index, searchTerm, urlId)) - return block; + public long[] getMetadata(IndexBlock block, int termId, long[] ids) { + final var index = indices.get(block); + if (null == index) { + return new long[ids.length]; } - return IndexBlock.Words_16Plus; - } - - public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) { - final var index = indices.get(block); - if (null == index) return false; - - return cachePool.isUrlPresent(index, searchTerm, urlId); + return indices.get(block).rangeForWord(termId).getMetadata(ids); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexURLRange.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexURLRange.java new file mode 100644 index 00000000..916aab9d --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexURLRange.java @@ -0,0 +1,100 @@ +package nu.marginalia.wmsa.edge.index.reader; + +import it.unimi.dsi.fastutil.longs.LongLongImmutablePair; +import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.btree.BTreeReader; +import
nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource; +import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; +import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromBTree; +import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromMapRange; + +import javax.annotation.Nullable; + +import static nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags.*; + +public class SearchIndexURLRange { + public final long dataOffset; + private final MultimapFileLong urlsFile; + + @Nullable + private final BTreeReader reader; + + public SearchIndexURLRange(MultimapFileLong urlsFile, long dataOffset) { + this.dataOffset = dataOffset; + this.urlsFile = urlsFile; + + if (dataOffset >= 0) { + this.reader = new BTreeReader(urlsFile, SearchIndexConverter.urlsBTreeContext, dataOffset); + } else { + this.reader = null; + } + } + + public EntrySource asPrefixSource(long prefix, long prefixNext) { + if (reader == null) + return new EmptyEntrySource(); + + LongLongImmutablePair startAndEnd = reader.getRangeForPrefix(prefix, prefixNext); + + if (startAndEnd.firstLong() == startAndEnd.secondLong()) { + return new EmptyEntrySource(); + } + + return new EntrySourceFromMapRange(urlsFile, startAndEnd.firstLong(), startAndEnd.secondLong()); + } + + public EntrySource asEntrySource() { + return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, null); + } + public EntrySource asQualityLimitingEntrySource(int limit) { + return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, limit); + } + public EntrySource asDomainEntrySource() { + return new EntrySourceFromBTree(reader, Subjects.asBit() | Site.asBit() | Title.asBit(), null); + } + + public boolean isPresent() { + return dataOffset >= 0; + } + + public long numEntries() { + if (reader == null) + return 0L; + + return reader.numEntries(); + } + + public void retainUrls(BTreeQueryBuffer buffer) { + if (reader != null) + reader.retainEntries(buffer); + } + + public void rejectUrls(BTreeQueryBuffer buffer) { + if (reader != null) + reader.rejectEntries(buffer); + } + + public boolean hasUrl(long url) { + if (reader == null) + return false; + + return reader.findEntry(url) >= 0; + } + + + public long[] getMetadata(long[] urls) { + if (reader == null) { + return new long[urls.length]; + } + + return reader.queryData(urls, 1); + } + + @Override + public String toString() { + return String.format("BTreeRange(@" + dataOffset + ", size = " + numEntries() + ")"); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java new file mode 100644 index 00000000..b76b65a6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java @@ -0,0 +1,111 @@ +package nu.marginalia.wmsa.edge.index.svc; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import io.prometheus.client.Histogram; +import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.wmsa.client.GsonFactory; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator; +import 
nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.id.EdgeIdList; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import org.apache.http.HttpStatus; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.HaltException; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.OptionalInt; + +import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; +import static spark.Spark.halt; + +@Singleton +public class EdgeIndexDomainQueryService { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); + + private final Gson gson = GsonFactory.get(); + + private final SearchIndexes indexes; + + @Inject + public EdgeIndexDomainQueryService(SearchIndexes indexes) { + this.indexes = indexes; + } + + public Object searchDomain(Request request, Response response) { + if (indexes.getLexiconReader() == null) { + logger.warn("Dictionary reader not yet initialized"); + halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); + } + + String json = request.body(); + EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class); + + try { + return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet)); + } + catch (HaltException ex) { + logger.warn("Halt", ex); + throw ex; + } + catch (Exception ex) { + logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); + logger.info("Error", ex); + Spark.halt(500, "Error"); + return null; + } + } + + public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) { + + final OptionalInt wordId = lookUpWord(specsSet.keyword); + final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>(); + + final IndexSearchBudget budget = new IndexSearchBudget(50); + + if (wordId.isEmpty()) { + return new EdgeDomainSearchResults(specsSet.keyword, urlIds); + } + + BTreeQueryBuffer buffer = new BTreeQueryBuffer(512); + + for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) { + + final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1); + var query = indexes.getBucket(bucket).getDomainQuery(wordId.getAsInt(), localFilter); + + while (query.hasMore() && urlIds.size() < specsSet.maxResults) { + query.getMoreResults(buffer); + + for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) { + long result = buffer.data[i]; + if (localFilter.test(result)) { + urlIds.add((int) (result & 0xFFFF_FFFFL)); + } + } + } + } + + return new EdgeDomainSearchResults(specsSet.keyword, urlIds); + } + + private OptionalInt lookUpWord(String s) { + int ret = indexes.getLexiconReader().get(s); + if (ret == DictionaryHashMap.NO_VALUE) { + return OptionalInt.empty(); + } + return OptionalInt.of(ret); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java index a942d892..520b559f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java +++
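getOrInsertWordIds in the lexicon service below builds the journal entry payload by interleaving each accepted word's lexicon id with its metadata long, then trimming the array when some words were rejected. A standalone sketch of that interleaving; lookUp and NO_VALUE are hypothetical stand-ins for the keyword lexicon:

    // Sketch only: interleave [wordId, metadata] pairs, skipping words the
    // lexicon rejects, and trim the result to the slots actually filled.
    import java.util.Arrays;

    class InterleaveSketch {
        static final long NO_VALUE = Long.MIN_VALUE;
        static long lookUp(String word) { return word.isBlank() ? NO_VALUE : word.hashCode(); }

        static long[] interleave(String[] words, long[] meta) {
            long[] ids = new long[words.length * 2];
            int putIdx = 0;
            for (int i = 0; i < words.length; i++) {
                long id = lookUp(words[i]);
                if (id != NO_VALUE) {
                    ids[putIdx++] = id;      // even slot: word id
                    ids[putIdx++] = meta[i]; // odd slot: that word's metadata
                }
            }
            return putIdx == ids.length ? ids : Arrays.copyOf(ids, putIdx);
        }

        public static void main(String[] args) {
            // the blank word is rejected, so its metadata is dropped too
            System.out.println(Arrays.toString(
                interleave(new String[] {"hello", "", "world"}, new long[] {1, 2, 3})));
        }
    }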
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import com.google.protobuf.InvalidProtocolBufferException; import nu.marginalia.util.ListChunker; import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; @@ -21,7 +22,6 @@ import spark.Request; import spark.Response; import java.util.Arrays; -import java.util.List; @Singleton public class EdgeIndexLexiconService { @@ -35,6 +35,11 @@ public class EdgeIndexLexiconService { this.keywordLexicon = servicesFactory.getKeywordLexicon(); } + public EdgeIndexLexiconService(SearchIndexes indexes, KeywordLexicon lexicon) { + this.indexes = indexes; + this.keywordLexicon = lexicon; + } + public Object getWordId(Request request, Response response) { final String word = request.splat()[0]; @@ -73,31 +78,37 @@ public class EdgeIndexLexiconService { public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexPutKeywordsReq.WordSet words, int idx ) { - SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); + SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); IndexBlock block = IndexBlock.values()[words.getIndex()]; - for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) { + var wordArray = words.getWordsList().toArray(String[]::new); + var metaArray = words.getMetaList().stream().mapToLong(Long::valueOf).toArray(); - var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); + DocumentKeywords documentKeywords = new DocumentKeywords(block, wordArray, metaArray); + for (var chunk : ListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) { + var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); var header = new SearchIndexJournalEntryHeader(domainId, urlId, block); indexWriter.put(header, entry); } } - private long[] getOrInsertWordIds(List<String> words) { - long[] ids = new long[words.size()]; + private long[] getOrInsertWordIds(String[] words, long[] meta) { + long[] ids = new long[words.length*2]; int putIdx = 0; - for (String word : words) { + for (int i = 0; i < words.length; i++) { + String word = words[i]; + long id = keywordLexicon.getOrInsert(word); if (id != DictionaryHashMap.NO_VALUE) { ids[putIdx++] = id; + ids[putIdx++] = meta[i]; } } - if (putIdx != words.size()) { + if (putIdx != words.length*2) { ids = Arrays.copyOf(ids, putIdx); } return ids; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index fd753f81..34bcc93a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -7,22 +7,23 @@ import gnu.trove.set.hash.TIntHashSet; import io.prometheus.client.Counter; import io.prometheus.client.Gauge; import io.prometheus.client.Histogram; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntComparator; +import it.unimi.dsi.fastutil.ints.IntList; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; +import
it.unimi.dsi.fastutil.longs.LongAVLTreeSet; +import nu.marginalia.util.btree.BTreeQueryBuffer; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams; import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeIdList; import nu.marginalia.wmsa.edge.model.search.*; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; import org.apache.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +37,6 @@ import java.util.function.LongPredicate; import java.util.stream.Collectors; import static java.util.Comparator.comparing; -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; import static spark.Spark.halt; @Singleton @@ -50,7 +50,6 @@ public class EdgeIndexQueryService { private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register(); private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); - private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); private final Gson gson = GsonFactory.get(); @@ -61,30 +60,6 @@ public class EdgeIndexQueryService { this.indexes = indexes; } - public Object searchDomain(Request request, Response response) { - if (indexes.getLexiconReader() == null) { - logger.warn("Dictionary reader not yet initialized"); - halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); - } - - String json = request.body(); - EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class); - - try { - return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet)); - } - catch (HaltException ex) { - logger.warn("Halt", ex); - throw ex; - } - catch (Exception ex) { - logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); - logger.info("Error", ex); - Spark.halt(500, "Error"); - return null; - } - } - public Object search(Request request, Response response) { if (indexes.getLexiconReader() == null) { logger.warn("Dictionary reader not yet initialized"); @@ -94,6 +69,7 @@ public class EdgeIndexQueryService { String json = request.body(); EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class); + try { return wmsa_edge_index_query_time.time(() -> query(specsSet)); } @@ -117,51 +93,20 @@ public class EdgeIndexQueryService { wmsa_edge_index_query_cost.set(searchQuery.getDataCost()); + if (!searchQuery.hasTimeLeft()) { + 
wmsa_edge_index_query_timeouts.inc(); + } + return new EdgeSearchResultSet(results); } - public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) { - - final OptionalInt wordId = lookUpWord(specsSet.keyword); - - final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>(); - - final IndexQueryCachePool pool = new IndexQueryCachePool(); - final IndexSearchBudget budget = new IndexSearchBudget(50); - - if (wordId.isEmpty()) { - - return new EdgeDomainSearchResults(specsSet.keyword, urlIds); - } - - for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) { - - final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1); - - var query = indexes.getBucket(bucket).getDomainQuery(pool, wordId.getAsInt(), localFilter); - long[] buffer = new long[512]; - - while (query.hasMore() && urlIds.size() < specsSet.maxResults) { - int cnt = query.getMoreResults(buffer, budget); - for (int i = 0; i < cnt && urlIds.size() < specsSet.maxResults; i++) { - long result = buffer[i]; - if (localFilter.test(result)) { - urlIds.add((int) (result & 0xFFFF_FFFFL)); - } - } - } - } - - return new EdgeDomainSearchResults(specsSet.keyword, urlIds); - } - private class SearchQuery { private final int fetchSize; private final TIntHashSet seenResults; private final EdgeSearchSpecification specsSet; private final IndexSearchBudget budget; - private final IndexQueryCachePool cachePool = new IndexQueryCachePool(); - + private final Integer qualityLimit; + private final Integer rankLimit; private long dataCost = 0; public SearchQuery(EdgeSearchSpecification specsSet) { @@ -169,6 +114,8 @@ this.budget = new IndexSearchBudget(specsSet.timeoutMs); this.fetchSize = specsSet.fetchSize; this.seenResults = new TIntHashSet(fetchSize, 0.5f); + this.qualityLimit = specsSet.quality; + this.rankLimit = specsSet.rank; } private List<EdgeSearchResultItem> execute() { @@ -178,32 +125,31 @@ results.addAll(performSearch(sq)); } - + final SearchTermEvaluator evaluator = new SearchTermEvaluator(specsSet, results); for (var result : results) { - addResultScores(result); + evaluator.addResultScores(result); } - if (!budget.hasTimeLeft()) { - wmsa_edge_index_query_timeouts.inc(); - } + return createResultList(results); + } + + private List<EdgeSearchResultItem> createResultList(Set<EdgeSearchResultItem> results) { var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain); - if (WmsaHome.isDebug()) { - cachePool.printSummary(logger); - } - cachePool.clear(); - List<EdgeSearchResultItem> resultList = results.stream() .sorted( comparing(EdgeSearchResultItem::getScore) - .thenComparing(EdgeSearchResultItem::getRanking) - .thenComparing(EdgeSearchResultItem::getUrlIdInt) + .thenComparing(EdgeSearchResultItem::getRanking) + .thenComparing(EdgeSearchResultItem::getUrlIdInt) ) .filter(domainCountFilter::test) .collect(Collectors.toList()); if (resultList.size() > specsSet.getLimitTotal()) { + // This can't be made a stream limit() operation because we need domainCountFilter + // to run over the entire list to provide accurate statistics + resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear(); } @@ -219,16 +165,20 @@ { final List<EdgeSearchResultItem> results = new ArrayList<>(fetchSize); - final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq); + final SearchTerms searchTerms = getSearchTerms(sq); - if (searchTerms.isEmpty()) + if (searchTerms.isEmpty()) { return Collections.emptyList(); + } + + final BTreeQueryBuffer buffer = new
BTreeQueryBuffer(fetchSize); for (int indexBucket : specsSet.buckets) { final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT); if (!budget.hasTimeLeft()) { - logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); + logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", + indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); continue; } @@ -237,20 +187,22 @@ break; } - IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms); - long[] buf = new long[fetchSize]; + IndexQueryParams queryParams = new IndexQueryParams(sq.block, searchTerms, qualityLimit, rankLimit, specsSet.domains); + + IndexQuery query = getQuery(indexBucket, localFilter::filterRawValue, queryParams); while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) { - int cnt = query.getMoreResults(buf, budget); + buffer.reset(); + query.getMoreResults(buffer); - for (int i = 0; i < cnt && results.size() < fetchSize; i++) { - final long id = buf[i]; + for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) { + final long id = buffer.data[i]; if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) { continue; } - results.add(new EdgeSearchResultItem(indexBucket, id)); + results.add(new EdgeSearchResultItem(indexBucket, sq.block, id)); } } @@ -261,40 +213,127 @@ return results; } - private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block, - LongPredicate filter, EdgeIndexSearchTerms searchTerms) { + private IndexQuery getQuery(int bucket, LongPredicate filter, IndexQueryParams params) { if (!indexes.isValidBucket(bucket)) { logger.warn("Invalid bucket {}", bucket); return new IndexQuery(Collections.emptyList()); } - return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms); + return indexes.getBucket(bucket).getQuery(filter, params); } - private void addResultScores(EdgeSearchResultItem searchResult) { + public boolean hasTimeLeft() { + return budget.hasTimeLeft(); + } + + private record IndexAndBucket(IndexBlock block, int bucket) {} + + public long getDataCost() { + return dataCost; + } + + record ResultTerm (int bucket, int termId, long combinedUrlId) {} + } + + public class SearchTermEvaluator { + private static final EdgePageWordMetadata blankMetadata = new EdgePageWordMetadata(EdgePageWordMetadata.emptyValue()); + + private final Map<SearchQuery.ResultTerm, EdgePageWordMetadata> termData = new HashMap<>(16); + + private final List<List<String>> searchTermVariants; + + public SearchTermEvaluator(EdgeSearchSpecification specsSet, Set<EdgeSearchResultItem> results) { + this.searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); + + final int[] termIdsAll = getIncludeTermIds(specsSet); + + Map<SearchQuery.IndexAndBucket, LongAVLTreeSet> resultIdsByBucket = new HashMap<>(7); + + for (int termId : termIdsAll) { + + for (var result: results) { + resultIdsByBucket + .computeIfAbsent(new SearchQuery.IndexAndBucket(result.block, result.bucketId), + id -> new LongAVLTreeSet()) + .add(result.combinedId); + } + + resultIdsByBucket.forEach((indexAndBucket, resultIds) -> + loadMetadata(termId, indexAndBucket.bucket, indexAndBucket.block, resultIds)); + + resultIdsByBucket.clear(); + } + } + + private int[] getIncludeTermIds(EdgeSearchSpecification specsSet) { + final var reader =
Objects.requireNonNull(indexes.getLexiconReader()); - List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); + final List<String> terms = specsSet.allIncludeSearchTerms(); + final IntList ret = new IntArrayList(terms.size()); - // Memoize calls to getTermData, as they're somewhat expensive and highly redundant - Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32); + for (var term : terms) { + int id = reader.get(term); + + if (id >= 0) + ret.add(id); + } + + return ret.toIntArray(); + } + + private void loadMetadata(int termId, int bucket, IndexBlock indexBlock, + LongAVLTreeSet docIdsMissingMetadata) + { + EdgeIndexBucket index = indexes.getBucket(bucket); + + if (docIdsMissingMetadata.isEmpty()) + return; + + + long[] ids = docIdsMissingMetadata.toLongArray(); + long[] metadata = index.getMetadata(indexBlock, termId, ids); + + for (int i = 0; i < metadata.length; i++) { + if (metadata[i] == 0L) + continue; + + termData.put( + new SearchQuery.ResultTerm(bucket, termId, ids[i]), + new EdgePageWordMetadata(metadata[i]) + ); + + docIdsMissingMetadata.remove(ids[i]); + } + } + + public void addResultScores(EdgeSearchResultItem searchResult) { + final var reader = Objects.requireNonNull(indexes.getLexiconReader()); double bestScore = 0; for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) { double setScore = 0; int setSize = 0; - for (var searchTerm : searchTermVariants.get(searchTermListIdx)) { + var termList = searchTermVariants.get(searchTermListIdx); + + for (int termIdx = 0; termIdx < termList.size(); termIdx++) { + String searchTerm = termList.get(termIdx); final int termId = reader.get(searchTerm); - ResultTermData data = termMetadata.computeIfAbsent( - new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData); + var key = new SearchQuery.ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()); + var metadata = termData.getOrDefault(key, blankMetadata); + + EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(searchTermListIdx, searchTerm, metadata); - var score = data.asScore(searchTermListIdx, searchTerm); searchResult.scores.add(score); - setScore += score.value(); + setScore += score.termValue(); + if (termIdx == 0) { + setScore += score.documentValue(); + } + setSize++; } bestScore = Math.min(bestScore, setScore/setSize); @@ -303,64 +342,27 @@ searchResult.setScore(bestScore); } - private ResultTermData getTermData(ResultTerm resultTerm) { - final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket); - final int termId = resultTerm.termId; - final long combinedUrlId = resultTerm.combinedUrlId; - return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId) - ); - } - - public long getDataCost() { - return dataCost; - } - - record ResultTerm (int
bucket, int termId, long combinedUrlId) {} - record ResultTermData (IndexBlock index, - boolean title, - boolean link, - boolean site, - boolean subject, - boolean name, - boolean high, - boolean mid, - boolean low - ) { - public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) { - return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low); - } - } } - - private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) { - final List excludes = new ArrayList<>(); - final List includes = new ArrayList<>(); + private SearchTerms getSearchTerms(EdgeSearchSubquery request) { + final IntList excludes = new IntArrayList(); + final IntList includes = new IntArrayList(); for (var include : request.searchTermsInclude) { var word = lookUpWord(include); if (word.isEmpty()) { logger.debug("Unknown search term: " + include); - return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList()); + return new SearchTerms(); } includes.add(word.getAsInt()); } - for (var advice : request.searchTermsAdvice) { var word = lookUpWord(advice); if (word.isEmpty()) { logger.debug("Unknown search term: " + advice); - return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList()); + return new SearchTerms(); } includes.add(word.getAsInt()); } @@ -369,7 +371,26 @@ public class EdgeIndexQueryService { lookUpWord(exclude).ifPresent(excludes::add); } - return new EdgeIndexSearchTerms(includes, excludes); + return new SearchTerms(includes, excludes); + } + + public record SearchTerms(IntList includes, IntList excludes) { + public SearchTerms() { + this(IntList.of(), IntList.of()); + } + + public boolean isEmpty() { + return includes.isEmpty(); + } + + public int[] sortedDistinctIncludes(IntComparator comparator) { + if (includes.isEmpty()) + return includes.toIntArray(); + + IntList list = new IntArrayList(new IntOpenHashSet(includes)); + list.sort(comparator); + return list.toIntArray(); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexDomainQueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexDomainQueryFactory.java index 794a08ea..d96b710e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexDomainQueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexDomainQueryFactory.java @@ -2,49 +2,31 @@ package nu.marginalia.wmsa.edge.index.svc.query; import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; -import java.util.*; -import java.util.stream.Collectors; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; public class IndexDomainQueryFactory { SearchIndex baseIndex; - List requiredIndices; - public Collection getIndicies() { - return requiredIndices; + public IndexDomainQueryFactory(SearchIndex sourceIndex) { + this.baseIndex = sourceIndex; } - public IndexDomainQueryFactory(SearchIndex baseIndex, List requiredIndices) { - this.baseIndex = baseIndex; - this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList()); - } - - public IndexQuery buildQuery(IndexQueryCachePool cachePool, int firstWordId) { + public IndexQuery buildQuery(int firstWordId) { if 
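// SearchTerms.sortedDistinctIncludes above deduplicates the include ids
// through an IntOpenHashSet before sorting with the caller's comparator;
// illustratively, includes of [3, 1, 3] under natural int order come
// back as [1, 3].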
(baseIndex == null) { return new IndexQuery(Collections.emptyList()); } List sources = new ArrayList<>(1); - var range = baseIndex.rangeForWord(cachePool, firstWordId); + var range = baseIndex.rangeForWord(firstWordId); if (range.isPresent()) { - sources.add(range.asEntrySource()); + sources.add(range.asDomainEntrySource()); } - var query = new IndexQuery(sources); - for (var required : requiredIndices) { - var requiredRange = required.rangeForWord(firstWordId); - if (requiredRange.isPresent()) { - query.addInclusionFilter(new QueryFilterBTreeRange(required, requiredRange, cachePool)); - } - else { - query.addInclusionFilter(QueryFilterStepIf.noPass()); - } - } - - return query; + return new IndexQuery(sources); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java index 94a431c3..bdd87297 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java @@ -1,13 +1,12 @@ package nu.marginalia.wmsa.edge.index.svc.query; +import nu.marginalia.util.btree.BTreeQueryBuffer; import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import java.util.ArrayList; import java.util.List; -import static java.lang.Math.min; - public class IndexQuery { private final List sources; private final List inclusionFilter = new ArrayList<>(10); @@ -27,33 +26,39 @@ public class IndexQuery { return si < sources.size(); } - public int getMoreResults(long[] dest, IndexSearchBudget budget) { - final EntrySource source = sources.get(si); - - int bufferUtilizedLength = source.read(dest, dest.length); - - if (bufferUtilizedLength <= 0) { - si++; - return 0; - } - - dataCost += bufferUtilizedLength; + public void getMoreResults(BTreeQueryBuffer dest) { + if (!fillBuffer(dest)) + return; for (var filter : inclusionFilter) { - bufferUtilizedLength = filter.retainDestructive(dest, bufferUtilizedLength); + filter.apply(dest); - dataCost += bufferUtilizedLength; + dataCost += dest.size(); - if (bufferUtilizedLength <= 0) { - si++; - return 0; + if (dest.isEmpty()) { + return; } } + } - int count = min(bufferUtilizedLength, dest.length); - System.arraycopy(dest, 0, dest, 0, count); + private boolean fillBuffer(BTreeQueryBuffer dest) { + for (;;) { + dest.reset(); - return count; + EntrySource source = sources.get(si); + source.read(dest); + + if (!dest.isEmpty()) { + break; + } + + if (!source.hasMore() && ++si >= sources.size()) + return false; + } + + dataCost += dest.size(); + + return !dest.isEmpty(); } public long dataCost() { @@ -62,9 +67,8 @@ public class IndexQuery { public String toString() { StringBuilder sb = new StringBuilder(); sb.append("Sources:\n"); - - for (var source: sources) { - sb.append("\t").append(source.getIndex().name).append("\n"); + for (var source : sources) { + sb.append(source).append('\n'); } sb.append("Includes:\n"); for (var include : inclusionFilter) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryCachePool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryCachePool.java deleted file mode 100644 index 52925c3c..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryCachePool.java +++ /dev/null @@ -1,60 +0,0 @@ -package 
nu.marginalia.wmsa.edge.index.svc.query; - -import nu.marginalia.util.btree.CachingBTreeReader; -import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import org.slf4j.Logger; - -import java.util.HashMap; -import java.util.Map; - -public class IndexQueryCachePool { - private final Map indexCaches = new HashMap<>(); - private final Map rangeCache = new HashMap<>(); - private final Map savedCounts = new HashMap<>(); - - public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) { - var key = new PoolKey(index, range.dataOffset); - var entry = indexCaches.get(key); - - if (entry == null) { - entry = range.createIndexCache(); - indexCaches.put(key, entry); - } - else { - savedCounts.merge(key, 1, Integer::sum); - } - - return entry; - } - - - public boolean isUrlPresent(SearchIndex index, int term, long url) { - var range = index.rangeForWord(this, term); - return range.isPresent() && range.hasUrl(this, url); - } - - public void printSummary(Logger logger) { - long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum(); - long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum(); - - long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count(); - - logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.); - } - - public void clear() { - indexCaches.clear(); - } - - public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) { - return rangeCache.get(new RangeKey(words, wordId)); - } - - public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) { - rangeCache.put(new RangeKey(words, wordId), range); - } - - private record RangeKey(IndexWordsTable table, int wordId) {} - private record PoolKey(SearchIndex index, long dataOffset) {} -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java index 19236755..f2707a99 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java @@ -1,13 +1,13 @@ package nu.marginalia.wmsa.edge.index.svc.query; import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource; import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeReject; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeRetain; import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import java.util.*; -import java.util.function.LongPredicate; import java.util.stream.Collectors; public class IndexQueryFactory { @@ -23,59 +23,101 @@ public class IndexQueryFactory { this.excludeIndex = excludeIndex; } - public IndexQueryBuilder buildQuery(IndexQueryCachePool cachePool, int firstWordId) { + public 
IndexQueryBuilder buildQuery(int firstWordId) { List sources = new ArrayList<>(requiredIndices.size()); for (var ri : requiredIndices) { - var range = ri.rangeForWord(cachePool, firstWordId); + var range = ri.rangeForWord(firstWordId); if (range.isPresent()) { sources.add(range.asEntrySource()); } } - return new IndexQueryBuilder(new IndexQuery(sources), cachePool); + return new IndexQueryBuilder(new IndexQuery(sources)); + } + + public IndexQueryBuilder buildQuery(int quality, int wordId) { + List sources = new ArrayList<>(requiredIndices.size()); + + for (var ri : requiredIndices) { + var range = ri.rangeForWord(wordId); + if (range.isPresent()) { + sources.add(range.asQualityLimitingEntrySource(quality)); + } + } + + return new IndexQueryBuilder(new IndexQuery(sources)); + } + + public IndexQueryBuilder buildQuery(List domains, int wordId) { + List sources = new ArrayList<>(requiredIndices.size()); + + for (var ri : requiredIndices) { + var range = ri.rangeForWord(wordId); + + if (range.isPresent()) { + for (int dom : domains) { + long prefix = (long) dom << 32L; + long prefixNext = prefix + 0x0000_0001_0000_0000L; + + var source = range.asPrefixSource(prefix, prefixNext); + if (source.hasMore()) { + sources.add(source); + } + } + } + + } + + if (sources.isEmpty()) { + sources.add(new EmptyEntrySource()); + } + + return new IndexQueryBuilder(new IndexQuery(sources)); } public class IndexQueryBuilder { private final IndexQuery query; - private final IndexQueryCachePool cachePool; - IndexQueryBuilder(IndexQuery query, - IndexQueryCachePool cachePool) { + IndexQueryBuilder(IndexQuery query) { this.query = query; - this.cachePool = cachePool; - } - - public void filter(LongPredicate predicate) { - query.addInclusionFilter(new QueryFilterStepFromPredicate(predicate)); } public IndexQueryBuilder also(int termId) { List filters = new ArrayList<>(requiredIndices.size()); for (var ri : requiredIndices) { - var range = ri.rangeForWord(cachePool, termId); + var range = ri.rangeForWord(termId); if (range.isPresent()) { - filters.add(new QueryFilterBTreeRange(ri, range, cachePool)); - } - else { - filters.add(QueryFilterStepIf.noPass()); + filters.add(new QueryFilterBTreeRangeRetain(range)); } } + if (filters.isEmpty()) { + filters.add(QueryFilterStepIf.noPass()); + } - filters.sort(Comparator.naturalOrder()); - query.addInclusionFilter(QueryFilterStepIf.anyOf(filters)); + + if (filters.size() > 1) { + filters.sort(Comparator.naturalOrder()); + query.addInclusionFilter(QueryFilterStepIf.anyOf(filters)); + } + else { + query.addInclusionFilter(filters.get(0)); + } return this; } + public void addInclusionFilter(QueryFilterStepIf filter) { + query.addInclusionFilter(filter); + } public IndexQueryBuilder not(int termId) { for (var ri : excludeIndex) { - var range = ri.rangeForWord(cachePool, termId); + var range = ri.rangeForWord(termId); if (range.isPresent()) { - query.addInclusionFilter(range.asExcludeFilterStep(cachePool)); + query.addInclusionFilter(new QueryFilterBTreeRangeReject(range)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryParams.java new file mode 100644 index 00000000..d157c8da --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryParams.java @@ -0,0 +1,16 @@ +package nu.marginalia.wmsa.edge.index.svc.query; + +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import 
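// The prefix arithmetic in the domain-restricted buildQuery above treats
// the high 32 bits of a combined id as the domain id: illustratively, for
// dom = 5, prefix is 0x0000_0005_0000_0000L and prefixNext is
// 0x0000_0006_0000_0000L, so the half-open range [prefix, prefixNext)
// covers exactly the ids whose domain component is 5.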
nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService; + +import java.util.List; + +public record IndexQueryParams(IndexBlock block, + EdgeIndexQueryService.SearchTerms searchTerms, + Integer qualityLimit, + Integer rankLimit, + List targetDomains + ) +{ + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EmptyEntrySource.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EmptyEntrySource.java new file mode 100644 index 00000000..43b171ad --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EmptyEntrySource.java @@ -0,0 +1,19 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types; + +import nu.marginalia.util.btree.BTreeQueryBuffer; + +public class EmptyEntrySource implements EntrySource { + @Override + public void skip(int n) { + } + + @Override + public void read(BTreeQueryBuffer buffer) { + buffer.zero(); + } + + @Override + public boolean hasMore() { + return false; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java index b550a589..c31c3aed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java @@ -1,9 +1,10 @@ package nu.marginalia.wmsa.edge.index.svc.query.types; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.util.btree.BTreeQueryBuffer; public interface EntrySource { - SearchIndex getIndex(); - int read(long[] buffer, int n); + void skip(int n); + void read(BTreeQueryBuffer buffer); + boolean hasMore(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromBTree.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromBTree.java new file mode 100644 index 00000000..7641966c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromBTree.java @@ -0,0 +1,108 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types; + +import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.btree.BTreeReader; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; + +import javax.annotation.Nullable; + +import static java.lang.Math.min; +import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.*; + +public class EntrySourceFromBTree implements EntrySource { + @Nullable + private final BTreeReader reader; + private final long metadataBitMask; + private final Integer qualityLimit; + + public static final long NO_MASKING = ~0L; + + int pos; + long endOffset; + + public EntrySourceFromBTree(@Nullable BTreeReader reader, long metadataBitMask, Integer qualityLimit) { + this.reader = reader; + this.metadataBitMask = metadataBitMask; + this.qualityLimit = qualityLimit; + + if (reader != null) { + pos = 0; + endOffset = pos + (long) reader.numEntries() * ENTRY_SIZE; + } + } + + + @Override + public void skip(int n) { + pos += n * ENTRY_SIZE; + } + + @Override + public void read(BTreeQueryBuffer buffer) { + if (reader == null) { + buffer.zero(); + return; + } + + assert buffer.end%ENTRY_SIZE == 0; + + buffer.end = min(buffer.end, (int)(endOffset - pos)); + + reader.readData(buffer.data, buffer.end, pos); + + pos += buffer.end; + + destagger(buffer); + buffer.uniq(); + } + + private 
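// Layout note (inferred from the constants above): each index record spans
// ENTRY_SIZE consecutive longs, with the url id at ENTRY_URL_OFFSET and the
// packed word metadata at ENTRY_METADATA_OFFSET. destagger() compacts the
// buffer to bare url ids and, when a metadata mask or quality limit is set,
// drops records whose metadata fails isQualityOk/isFlagsOk. If ENTRY_SIZE
// were 2 with the url word first, [url0, meta0, url1, meta1] would collapse
// to [url0, url1] and buffer.end would halve.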
void destagger(BTreeQueryBuffer buffer) { + if (metadataBitMask == NO_MASKING && qualityLimit == null) { + for (int i = 0; (i + ENTRY_SIZE - 1) < buffer.end; i += ENTRY_SIZE) { + buffer.data[i / ENTRY_SIZE] = buffer.data[i + ENTRY_URL_OFFSET]; + } + + buffer.end /= ENTRY_SIZE; + } + else { + int write = 0; + + for (int read = 0; read < buffer.end; read+=ENTRY_SIZE) { + final long metadata = buffer.data[read + ENTRY_METADATA_OFFSET]; + + if (isQualityOk(metadata) && isFlagsOk(metadata)) { + buffer.data[write++] = buffer.data[read+ENTRY_URL_OFFSET]; + } + } + + buffer.end = write; + } + } + + private boolean isFlagsOk(long metadata) { + return metadataBitMask == ~0L || EdgePageWordMetadata.hasFlags(metadata, metadataBitMask); + } + + private boolean isQualityOk(long metadata) { + if (qualityLimit == null) + return true; + + final int quality = EdgePageWordMetadata.decodeQuality(metadata); + + if (qualityLimit < 0) + return quality > -qualityLimit; + else + return quality < qualityLimit; + } + + @Override + public boolean hasMore() { + return pos < endOffset; + } + + @Override + public String toString() { + return String.format("BTreeRange.EntrySource(@" + pos + ": " + endOffset + ")"); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromMapRange.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromMapRange.java new file mode 100644 index 00000000..99cb94d6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromMapRange.java @@ -0,0 +1,60 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types; + +import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.multimap.MultimapFileLong; + +import static java.lang.Math.min; +import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE; +import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_URL_OFFSET; + +public class EntrySourceFromMapRange implements EntrySource { + + private final MultimapFileLong map; + private long pos; + private final long endOffset; + + public EntrySourceFromMapRange(MultimapFileLong map, long start, long end) { + this.map = map; + this.pos = start; + this.endOffset = end; + } + + @Override + public void skip(int n) { + pos += (long) n * ENTRY_SIZE; + } + + @Override + public void read(BTreeQueryBuffer buffer) { + + assert buffer.end%ENTRY_SIZE == 0; + + buffer.end = min(buffer.end, (int)(endOffset - pos)); + + map.read(buffer.data, buffer.end, pos); + + pos += buffer.end; + + destagger(buffer); + buffer.uniq(); + } + + private void destagger(BTreeQueryBuffer buffer) { + for (int i = 0; (i + ENTRY_SIZE - 1) < buffer.end; i += ENTRY_SIZE) { + buffer.data[i / ENTRY_SIZE] = buffer.data[i + ENTRY_URL_OFFSET]; + } + + buffer.end /= ENTRY_SIZE; + } + + @Override + public boolean hasMore() { + return pos < endOffset; + } + + @Override + public String toString() { + return String.format("BTreeRange.EntrySourceFromMapRange(@" + pos + ": " + endOffset + ")"); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java index 52d8d1f2..9944b89d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java @@ -1,21 +1,18 @@ package nu.marginalia.wmsa.edge.index.svc.query.types.filter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.util.btree.BTreeQueryBuffer; +import java.util.Arrays; import java.util.List; import java.util.StringJoiner; -class QueryFilterAnyOf implements QueryFilterStepIf { +public class QueryFilterAnyOf implements QueryFilterStepIf { private final List steps; - QueryFilterAnyOf(List steps) { + public QueryFilterAnyOf(List steps) { this.steps = steps; } - public SearchIndex getIndex() { - return null; - } - public double cost() { return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.); } @@ -29,6 +26,35 @@ class QueryFilterAnyOf implements QueryFilterStepIf { return false; } + + public void apply(BTreeQueryBuffer buffer) { + int start; + int end = buffer.end; + + steps.get(0).apply(buffer); + + // The filter functions will partition the data in the buffer from 0 to END, + // and update END to the length of the retained items, keeping the retained + // items sorted but making no guarantees about the rejected half + // + // Therefore, we need to re-sort the rejected side, and to satisfy the + // constraint that the data is sorted up to END, finally sort it again. + // + // This sorting may seem like it's slower, but filter.apply(...) is + // typically much faster than iterating over filter.test(...); so this + // is more than made up for + + for (int fi = 1; fi < steps.size(); fi++) + { + start = buffer.end; + Arrays.sort(buffer.data, start, end); + buffer.startFilterForRange(start, end); + steps.get(fi).apply(buffer); + } + + Arrays.sort(buffer.data, 0, buffer.end); + } + public String describe() { StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]"); for (var step : steps) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRange.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRange.java deleted file mode 100644 index 6ce32620..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRange.java +++ /dev/null @@ -1,33 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; - -import nu.marginalia.util.btree.CachingBTreeReader; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; -import org.jetbrains.annotations.Nullable; - -public record QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, CachingBTreeReader.BTreeCachedIndex cache) implements QueryFilterStepIf { - - public QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, IndexQueryCachePool pool) { - this(source, range, pool.getIndexCache(source, range)); - } - - @Nullable - @Override - public SearchIndex getIndex() { - return source; - } - - public boolean test(long id) { - return range.hasUrl(cache, id); - } - - @Override - public double cost() { - return cache.getIndexedDataSize(); - } - - @Override - public String describe() { - return "UrlRange["+getIndex().name+"]"; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeReject.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeReject.java new file mode 100644 index 
00000000..ed826f10 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeReject.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; + +import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexURLRange; + +public record QueryFilterBTreeRangeReject(SearchIndexURLRange range) implements QueryFilterStepIf { + + @Override + public void apply(BTreeQueryBuffer buffer) { + range.rejectUrls(buffer); + buffer.finalizeFiltering(); + } + + public boolean test(long id) { + return !range.hasUrl(id); + } + + @Override + public double cost() { + return range.numEntries(); + } + + @Override + public String describe() { + return "Reject: UrlRange[]"; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeRetain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeRetain.java new file mode 100644 index 00000000..c0929076 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeRetain.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; + +import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexURLRange; + +public record QueryFilterBTreeRangeRetain(SearchIndexURLRange range) implements QueryFilterStepIf { + + @Override + public void apply(BTreeQueryBuffer buffer) { + range.retainUrls(buffer); + buffer.finalizeFiltering(); + } + + public boolean test(long id) { + return range.hasUrl(id); + } + + @Override + public double cost() { + return range.numEntries(); + } + + @Override + public String describe() { + return "UrlRange[]"; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java index 3c2f6b07..f1a9a964 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java @@ -1,6 +1,6 @@ package nu.marginalia.wmsa.edge.index.svc.query.types.filter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.util.btree.BTreeQueryBuffer; class QueryFilterNoPass implements QueryFilterStepIf { static final QueryFilterStepIf instance = new QueryFilterNoPass(); @@ -10,8 +10,8 @@ class QueryFilterNoPass implements QueryFilterStepIf { return false; } - public SearchIndex getIndex() { - return null; + public void apply(BTreeQueryBuffer buffer) { + buffer.finalizeFiltering(); } public double cost() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java index 4fdb204e..af2bca13 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java @@ -1,8 +1,5 @@ package nu.marginalia.wmsa.edge.index.svc.query.types.filter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import 
org.jetbrains.annotations.Nullable; - import java.util.function.LongPredicate; public class QueryFilterStepFromPredicate implements QueryFilterStepIf { @@ -12,12 +9,6 @@ public class QueryFilterStepFromPredicate implements QueryFilterStepIf { this.pred = pred; } - @Nullable - @Override - public SearchIndex getIndex() { - return null; - } - @Override public boolean test(long value) { return pred.test(value); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java index 211c9e79..e1418e1d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java @@ -1,14 +1,10 @@ package nu.marginalia.wmsa.edge.index.svc.query.types.filter; -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.util.btree.BTreeQueryBuffer; -import javax.annotation.Nullable; import java.util.List; public interface QueryFilterStepIf extends Comparable { - @Nullable - SearchIndex getIndex(); - boolean test(long value); double cost(); @@ -19,46 +15,27 @@ public interface QueryFilterStepIf extends Comparable { String describe(); - /** - * Move each value in items to the beginning of the array, - * and return the number of matching items. + /**
+     * For each item in buffer from READ to END, retain the items that
+     * satisfy the filter, maintaining their order, and update END
+     * to the length of the retained items.
+     *
-     * The remaining values are undefined.
+     *
+     * Items that are rejected are moved past the new END; all items
+     * are kept, but their order is not guaranteed.
+     *
+     * ASSUMPTION: buffer is sorted up until end.
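+     *
+     * For example (illustrative): given buffer data [1, 2, 3, 4] with
+     * end = 4 and a filter accepting even values, apply() leaves
+     * [2, 4, ...] with end = 2, while the rejected odd values survive
+     * past end in unspecified order.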
*/ - default int retainDestructive(long[] items, int max) { - int keep = 0; - for (int i = 0; i < max; i++) { - if (test(items[i])) { - if (i != keep) { - items[keep] = items[i]; - } - keep++; + default void apply(BTreeQueryBuffer buffer) { + while (buffer.hasMore()) { + if (test(buffer.currentValue())) { + buffer.retainAndAdvance(); + } + else { + buffer.rejectAndAdvance(); } } - return keep; + buffer.finalizeFiltering(); } - /** - * Move each value in items to the beginning of the array, - * and return the number of matching items. The values that do - * not pass the test are moved to the end of the array. - */ - default int retainReorder(long[] items, int start, int max) { - int keep = 0; - for (int i = start; i < max; i++) { - if (test(items[i])) { - if (i != keep) { - long tmp = items[keep]; - items[keep] = items[i]; - items[i] = tmp; - } - keep++; - } - } - return keep; - } - - static QueryFilterStepIf noPass() { return QueryFilterNoPass.instance; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryRankLimitingFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryRankLimitingFilter.java new file mode 100644 index 00000000..69cdc833 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryRankLimitingFilter.java @@ -0,0 +1,37 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; + +import nu.marginalia.util.btree.BTreeQueryBuffer; + +public class QueryRankLimitingFilter implements QueryFilterStepIf +{ + private final int rankLimit; + + public QueryRankLimitingFilter(int rankLimit) { + this.rankLimit = rankLimit; + } + + @Override + public boolean test(long value) { + long rank = value >>> 32L; + return rank < rankLimit; + } + + @Override + public void apply(BTreeQueryBuffer buffer) { + + while (buffer.hasMore() && test(buffer.currentValue())) { + buffer.retainAndAdvance(); + } + + buffer.finalizeFiltering(); + } + @Override + public double cost() { + return 0; + } + + @Override + public String describe() { + return getClass().getSimpleName(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java index 905b7486..5d207fd4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java @@ -2,9 +2,9 @@ package nu.marginalia.wmsa.edge.integration.model; import lombok.AllArgsConstructor; import lombok.Data; +import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.EdgeUrl; @Data diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java index a5fb0656..839be1ab 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow; import com.google.inject.Inject; import 
nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; @@ -43,13 +44,13 @@ public class StackOverflowPostProcessor { } var dld = sentenceExtractor.extractSentences(doc); - var keywords = documentKeywordExtractor.extractKeywords(dld); + var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata(-15)); - keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJust("special:wikipedia"); - keywords.get(IndexBlock.Meta).addJust("special:wikipedia"); - keywords.get(IndexBlock.Meta).addJust("js:true"); + keywords.get(IndexBlock.Meta).addJustNoMeta("site:"+post.getUrl().domain); + keywords.get(IndexBlock.Words_1).addJustNoMeta("site:"+post.getUrl().domain); + keywords.get(IndexBlock.Words_1).addJustNoMeta("special:wikipedia"); + keywords.get(IndexBlock.Meta).addJustNoMeta("special:wikipedia"); + keywords.get(IndexBlock.Meta).addJustNoMeta("js:true"); String title = StringUtils.abbreviate(post.getTitle(), 255); String description = StringUtils.abbreviate(Jsoup.parseBodyFragment(post.getJustBody()).text(), 255); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java index 22536b90..f724c0b5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.integration.wikipedia; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; @@ -39,13 +40,13 @@ public class WikipediaProcessor { EdgeDomainLink[] domainLinks = getDomainLinks(docUrl, doc); var dld = sentenceExtractor.extractSentences(doc); - var keywords = documentKeywordExtractor.extractKeywords(dld); + var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata(15)); - keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJust("special:stackoverflow"); - keywords.get(IndexBlock.Meta).addJust("special:stackoverflow"); - keywords.get(IndexBlock.Meta).addJust("js:true"); + keywords.get(IndexBlock.Meta).addJustNoMeta("site:"+post.getUrl().domain); + keywords.get(IndexBlock.Words_1).addJustNoMeta("site:"+post.getUrl().domain); + keywords.get(IndexBlock.Words_1).addJustNoMeta("special:stackoverflow"); + keywords.get(IndexBlock.Meta).addJustNoMeta("special:stackoverflow"); + keywords.get(IndexBlock.Meta).addJustNoMeta("js:true"); return new BasicDocumentData(docUrl, title, description, post.body.hashCode(), keywords, domainLinks, 
dld.totalNumWords()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java index b10d0e88..6a57a871 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java @@ -1,12 +1,18 @@ package nu.marginalia.wmsa.edge.model.crawl; public enum EdgeDomainIndexingState { - ACTIVE, - EXHAUSTED, - SPECIAL, - SOCIAL_MEDIA, - BLOCKED, - REDIR, - ERROR, - UNKNOWN + ACTIVE("Active"), + EXHAUSTED("Fully Crawled"), + SPECIAL("Content is side-loaded"), + SOCIAL_MEDIA("Social media-like website"), + BLOCKED("Blocked"), + REDIR("Redirected to another domain"), + ERROR("Error during crawling"), + UNKNOWN("Unknown"); + + public String desc; + + EdgeDomainIndexingState(String desc) { + this.desc = desc; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java index 7dfe0f6a..242ac5da 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.model.crawl; import lombok.Data; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import java.util.*; @@ -25,10 +26,12 @@ public class EdgePageWordSet { return words; } - public void append(IndexBlock block, Collection words) { + public void append(IndexBlock block, Collection words) { wordSets.computeIfAbsent(block, b -> new EdgePageWords(block)).addAll(words); } - + public void appendWithNoMeta(IndexBlock block, Collection words) { + wordSets.computeIfAbsent(block, b -> new EdgePageWords(block)).addAllNoMeta(words); + } public Collection values() { return new ArrayList<>(wordSets.values()); } @@ -41,7 +44,10 @@ public class EdgePageWordSet { var sj = new StringJoiner("\n", "EdgePageWordSet:\n", ""); wordSets.forEach((block, words) -> { if (words.size() > 0) { - sj.add("\t" + block + "\t" + words.getWords()); + sj.add("\t" + block); + for (int i = 0; i < words.size(); i++) { + sj.add("\t\t" + words.getWords().get(i) + ":" + new EdgePageWordMetadata(words.getMetadata().get(i))); + } } }); return sj.toString(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java index bc97d6aa..2258d764 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java @@ -1,35 +1,91 @@ package nu.marginalia.wmsa.edge.model.crawl; + +import gnu.trove.list.array.TLongArrayList; import lombok.Getter; import lombok.ToString; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.Set; @ToString @Getter public class EdgePageWords{ public final IndexBlock block; - public final List words = new ArrayList<>(); + public final ArrayList words = new ArrayList<>(); + public 
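// words and metadata below are parallel arrays: metadata.get(i) is the
// packed long, decoded by EdgePageWordMetadata, that describes
// words.get(i), so every code path appending a word must append a metadata
// entry in lockstep (as addAll and addAllNoMeta do, the latter padding
// with 0L).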
final TLongArrayList metadata = new TLongArrayList(); public EdgePageWords(IndexBlock block) { this.block = block; } - public EdgePageWords(IndexBlock block, Collection initial) { + public EdgePageWords(IndexBlock block, Collection initial) { this.block = block; - addAll(initial); + words.ensureCapacity(initial.size()); + metadata.ensureCapacity(initial.size()); + for (var entry : initial) { + words.add(entry.word); + metadata.add(entry.metadata); + } } - public void addAll(Collection words) { + public static EdgePageWords withBlankMetadata(IndexBlock block, List entries) { + List emptyMeta = new ArrayList<>(entries.size()); + + for (int i = 0; i < entries.size(); i++) { + emptyMeta.add(EdgePageWordMetadata.emptyValue()); + } + + return new EdgePageWords(block, entries, emptyMeta); + } + + public void addJustNoMeta(String word) { + words.add(word); + metadata.add(0); + } + + private EdgePageWords(IndexBlock block, List words, List meta) { + this.block = block; + this.words.addAll(words); + this.metadata.addAll(meta); } - public void addAllMax(Collection words, int limit) { - words.stream().limit(limit).forEach(this.words::add); + + public void addAll(Collection newWords) { + words.ensureCapacity(words.size() + newWords.size()); + metadata.ensureCapacity(metadata.size() + newWords.size()); + + for (var entry : newWords) { + words.add(entry.word); + metadata.add(entry.metadata); + } } + + public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set flagWords) { + for (int i = 0; i < words.size(); i++) { + if (flagWords.contains(words.get(i))) { + metadata.set(i, metadata.get(i) | flag.asBit()); + } + } + } + + public void addAllNoMeta(Collection newWords) { + words.ensureCapacity(words.size() + newWords.size()); + metadata.ensureCapacity(metadata.size() + newWords.size()); + + for (var entry : newWords) { + words.add(entry); + metadata.add(0L); + } + } + public int size() { return words.size(); } - public void addJust(String word) { words.add(word); } + public record Entry(String word, long metadata) { + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlState.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlState.java index 67fc2b61..f9b57be3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlState.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlState.java @@ -5,6 +5,6 @@ public enum EdgeUrlState { OK, REDIRECT, DEAD, - ARCHIVED, - DISQUALIFIED + DISQUALIFIED; + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java index 4db221b4..d6d20233 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java @@ -2,24 +2,26 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.AllArgsConstructor; import lombok.Getter; -import lombok.ToString; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeId; import java.util.ArrayList; import java.util.List; -@AllArgsConstructor @ToString @Getter +@AllArgsConstructor @Getter public class EdgeSearchResultItem { public final int bucketId; + public final IndexBlock block; public final long combinedId; public final List scores; public int 
resultsFromDomain; - public EdgeSearchResultItem(int bucketId, long val) { + public EdgeSearchResultItem(int bucketId, IndexBlock block, long val) { this.bucketId = bucketId; + this.block = block; this.combinedId = val; this.scores = new ArrayList<>(16); } @@ -49,6 +51,10 @@ public class EdgeSearchResultItem { return getUrlIdInt(); } + public String toString() { + return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "; bucket = " + bucketId + "]"; + } + public boolean equals(Object other) { if (other == null) return false; @@ -63,7 +69,7 @@ public class EdgeSearchResultItem { public long deduplicationKey() { final int ranking = getRanking(); - if (ranking == Integer.MAX_VALUE) { + if (ranking == Integer.MAX_VALUE || ranking == Integer.MIN_VALUE) { return 0; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java index 10fcfa67..24406fc3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java @@ -1,36 +1,55 @@ package nu.marginalia.wmsa.edge.model.search; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; + +import static java.lang.Integer.lowestOneBit; +import static java.lang.Integer.numberOfTrailingZeros; public record EdgeSearchResultKeywordScore(int set, String keyword, - IndexBlock index, - boolean title, - boolean link, - boolean site, - boolean subject, - boolean name, - boolean high, - boolean mid, - boolean low) { - public double value() { - double sum = 0; - if (title) - sum -= 15; - if (link) - sum -= 10; - if (site) - sum -= 10; - if (subject) - sum -= 10; - if (high) - sum -= 5; - if (mid) - sum -= 3; - if (low) - sum -= 2; - if (name) - sum -= -1; + EdgePageWordMetadata metadata) { + public double documentValue() { + long sum = 0; + sum += metadata.quality() / 5.; + if (metadata.flags().contains(EdgePageWordFlags.Simple)) { + sum += 20; + } return sum; } + + public double termValue() { + double sum = 0; + + if (metadata.flags().contains(EdgePageWordFlags.Title)) { + sum -= 15; + } + + if (metadata.flags().contains(EdgePageWordFlags.Site)) { + sum -= 10; + } + else if (metadata.flags().contains(EdgePageWordFlags.SiteAdjacent)) { + sum -= 5; + } + + if (metadata.flags().contains(EdgePageWordFlags.Subjects)) { + sum -= 10; + } + if (metadata.flags().contains(EdgePageWordFlags.NamesWords)) { + sum -= 1; + } + + sum -= metadata.tfIdf() / 50.; + sum += firstPos() / 5.; + sum -= Integer.bitCount(positions()) / 3.; + + return sum; + } + + public int firstPos() { + return numberOfTrailingZeros(lowestOneBit(metadata.positions())); + } + public int positions() { return metadata.positions(); } + public boolean isSpecial() { return keyword.contains(":"); } + public boolean isRegular() { return !keyword.contains(":"); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index 3fd6699e..c39ea7fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -2,13 +2,18 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; +import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; @ToString @Getter @Builder @With @AllArgsConstructor public class EdgeSearchSpecification { public List buckets; public List subqueries; + public List domains; + public final int limitByDomain; public final int limitTotal; @@ -17,4 +22,14 @@ public class EdgeSearchSpecification { public final int timeoutMs; public final int fetchSize; + public final Integer quality; + public final Integer rank; + + public List allIncludeSearchTerms() { + Set searchTerms = new HashSet<>(64); + for (var query : subqueries) { + searchTerms.addAll(query.searchTermsInclude); + } + return new ArrayList<>(searchTerms); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index 281be169..bd1e7ade 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java @@ -4,9 +4,10 @@ import lombok.*; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchRankingSymbols; +import java.util.EnumSet; import java.util.Objects; +import java.util.StringJoiner; @AllArgsConstructor @NoArgsConstructor @With @Getter @ToString public class EdgeUrlDetails { @@ -32,6 +33,9 @@ public class EdgeUrlDetails { public int resultsFromSameDomain; + public String positions; + public EdgeSearchResultItem resultItem; + public boolean hasMoreResults() { return resultsFromSameDomain > 1; } @@ -122,6 +126,42 @@ public class EdgeUrlDetails { return "PLAIN".equals(format); } + public int getProblemCount() { + int numProblems = 0; + + for (var problem :EnumSet.of( + HtmlFeature.JS, + HtmlFeature.TRACKING, + HtmlFeature.AFFILIATE_LINK, + HtmlFeature.COOKIES, + HtmlFeature.ADVERTISEMENT)) { + if (HtmlFeature.hasFeature(features, problem)) { + numProblems++; + } + } + return numProblems; + } + + public String getProblems() { + StringJoiner sj = new StringJoiner(", "); + + if (isScripts()) { + sj.add("Javascript"); + } + if (isCookies()) { + sj.add("Cookies"); + } + if (isTracking()) { + sj.add("Tracking/Analytics"); + } + if (isAffiliate()) { + sj.add("Affiliate Linking"); + } + + return sj.toString(); + + } + public boolean isScripts() { return HtmlFeature.hasFeature(features, HtmlFeature.JS); } @@ -137,7 +177,6 @@ public class EdgeUrlDetails { public boolean isCookies() { return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); } - public boolean isUnknown() { return HtmlFeature.hasFeature(features, HtmlFeature.UNKNOWN); } public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); } public boolean isSpecialDomain() { @@ -145,12 +184,13 @@ public class EdgeUrlDetails { } public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } - public String getRankingSymbol() { - return EdgeSearchRankingSymbols.getRankingSymbol(termScore); - } + public int getMatchRank() { + if (termScore <= 1) return 1; + if (termScore <= 2) return 2; + if (termScore <= 3) return 3; + if (termScore <= 
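// getMatchRank buckets the raw termScore into the coarse ranks 1, 2, 3, 5
// and 10; illustratively, a termScore of 2.5 falls through the first two
// checks and maps to rank 3.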
5) return 5; - public String getRankingSymbolDesc() { - return EdgeSearchRankingSymbols.getRankingSymbolDescription(termScore); + return 10; } public double getFeatureScore() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java index 6c28c0ef..6483c922 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java @@ -1,26 +1,17 @@ package nu.marginalia.wmsa.edge.search; -import com.google.common.base.Strings; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.wmsa.api.model.ApiSearchResult; -import nu.marginalia.wmsa.api.model.ApiSearchResults; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.configuration.WebsiteUrl; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.search.command.CommandEvaluator; import nu.marginalia.wmsa.edge.search.command.IndexCommand; -import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; -import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchErrorPageService; +import nu.marginalia.wmsa.edge.search.svc.*; import nu.marginalia.wmsa.resource_store.StaticResources; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,54 +21,51 @@ import spark.Spark; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; -import java.util.Optional; -import java.util.stream.Collectors; public class EdgeSearchService extends Service { - private final EdgeIndexClient indexClient; - private final EdgeSearchOperator searchOperator; - private final CommandEvaluator searchCommandEvaulator; private final WebsiteUrl websiteUrl; private StaticResources staticResources; - private final EdgeSearchErrorPageService errorPageService; private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class); @SneakyThrows @Inject public EdgeSearchService(@Named("service-host") String ip, @Named("service-port") Integer port, - EdgeIndexClient indexClient, Initialization initialization, MetricsServer metricsServer, - EdgeSearchOperator searchOperator, - CommandEvaluator searchCommandEvaulator, WebsiteUrl websiteUrl, StaticResources staticResources, IndexCommand indexCommand, - EdgeSearchErrorPageService errorPageService) { + EdgeSearchErrorPageService errorPageService, + EdgeSearchAddToCrawlQueueService addToCrawlQueueService, + EdgeSearchFlagSiteService flagSiteService, + EdgeSearchQueryService searchQueryService, + EdgeSearchApiQueryService apiQueryService + ) { super(ip, port, initialization, metricsServer); - this.indexClient = indexClient; - this.searchOperator = searchOperator; - this.searchCommandEvaulator = searchCommandEvaulator; this.websiteUrl = websiteUrl; this.staticResources = staticResources; - this.errorPageService = errorPageService; Spark.staticFiles.expireTime(600); - Spark.get("/search", 
this::pathSearch); + Spark.get("/search", searchQueryService::pathSearch); Gson gson = GsonFactory.get(); - Spark.get("/api/search", this::apiSearch, gson::toJson); - Spark.get("/public/search", this::pathSearch); + Spark.get("/api/search", apiQueryService::apiSearch, gson::toJson); + Spark.get("/public/search", searchQueryService::pathSearch); Spark.get("/public/site-search/:site/*", this::siteSearchRedir); Spark.get("/public/", indexCommand::render); Spark.get("/public/:resource", this::serveStatic); + Spark.post("/public/site/suggest/", addToCrawlQueueService::suggestCrawling); + + Spark.get("/public/site/flag-site/:domainId", flagSiteService::flagSiteForm); + Spark.post("/public/site/flag-site/:domainId", flagSiteService::flagSiteAction); + Spark.get("/site-search/:site/*", this::siteSearchRedir); @@ -108,67 +96,5 @@ public class EdgeSearchService extends Service { } - @SneakyThrows - private Object apiSearch(Request request, Response response) { - - final var ctx = Context.fromRequest(request); - final String queryParam = request.queryParams("query"); - final int limit; - EdgeSearchProfile profile = EdgeSearchProfile.YOLO; - - String count = request.queryParamOrDefault("count", "20"); - limit = Integer.parseInt(count); - - String index = request.queryParamOrDefault("index", "0"); - if (!Strings.isNullOrEmpty(index)) { - profile = switch (index) { - case "0" -> EdgeSearchProfile.YOLO; - case "1" -> EdgeSearchProfile.MODERN; - case "2" -> EdgeSearchProfile.DEFAULT; - case "3" -> EdgeSearchProfile.CORPO_CLEAN; - default -> EdgeSearchProfile.CORPO_CLEAN; - }; - } - - final String humanQuery = queryParam.trim(); - - var results = searchOperator.doApiSearch(ctx, new EdgeUserSearchParameters(humanQuery, profile, SearchJsParameter.DEFAULT)); - - return new ApiSearchResults("RESTRICTED", humanQuery, results.stream().map(ApiSearchResult::new).limit(limit).collect(Collectors.toList())); - } - - @SneakyThrows - private Object pathSearch(Request request, Response response) { - - final var ctx = Context.fromRequest(request); - - final String queryParam = request.queryParams("query"); - if (null == queryParam || queryParam.isBlank()) { - response.redirect(websiteUrl.url()); - return null; - } - - final String profileStr = Optional.ofNullable(request.queryParams("profile")).orElse(EdgeSearchProfile.YOLO.name); - final String humanQuery = queryParam.trim(); - - var params = new SearchParameters( - EdgeSearchProfile.getSearchProfile(profileStr), - SearchJsParameter.parse(request.queryParams("js")) - ); - - try { - return searchCommandEvaulator.eval(ctx, params, humanQuery); - } - catch (RedirectException ex) { - response.redirect(ex.newUrl); - } - catch (Exception ex) { - logger.error("Error", ex); - errorPageService.serveError(ctx, response); - } - - return ""; - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java index 33ebb1bc..ff5662ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java @@ -1,8 +1,8 @@ package nu.marginalia.wmsa.edge.search.command; -import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; -public record SearchParameters(EdgeSearchProfile profile, SearchJsParameter js) { +public record SearchParameters(EdgeSearchProfile profile, 
SearchJsParameter js, boolean detailedResults) { public String profileStr() { return profile.name; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java index c75c0699..3b1ddab6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java @@ -10,7 +10,6 @@ import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; import nu.marginalia.wmsa.renderer.mustache.RendererFactory; @@ -21,9 +20,8 @@ public class SearchCommand implements SearchCommandInterface { private final EdgeDomainBlacklist blacklist; private final EdgeDataStoreDao dataStoreDao; private final EdgeSearchOperator searchOperator; - private final EdgeSearchUnitConversionService edgeSearchUnitConversionService; private final MustacheRenderer searchResultsRenderer; - private BrowseResultCleaner browseResultCleaner; + private final BrowseResultCleaner browseResultCleaner; public static final int MAX_DOMAIN_RESULTS = 3; @@ -31,14 +29,12 @@ public class SearchCommand implements SearchCommandInterface { public SearchCommand(EdgeDomainBlacklist blacklist, EdgeDataStoreDao dataStoreDao, EdgeSearchOperator searchOperator, - EdgeSearchUnitConversionService edgeSearchUnitConversionService, RendererFactory rendererFactory, BrowseResultCleaner browseResultCleaner ) throws IOException { this.blacklist = blacklist; this.dataStoreDao = dataStoreDao; this.searchOperator = searchOperator; - this.edgeSearchUnitConversionService = edgeSearchUnitConversionService; this.browseResultCleaner = browseResultCleaner; searchResultsRenderer = rendererFactory.renderer("edge/search-results"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java index 3a6e4ff4..76b24d51 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java @@ -5,10 +5,10 @@ import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.model.DomainInformation; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService; import nu.marginalia.wmsa.edge.search.svc.EdgeSearchQueryIndexService; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; @@ -58,21 +58,27 @@ public class SiteListCommand implements SearchCommandInterface { List resultSet; Path screenshotPath = null; + 
Integer domainId = -1; if (null != domain) { resultSet = searchQueryIndexService.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain); - - screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id()); + domainId = dataStoreDao.getDomainId(domain).id(); + screenshotPath = Path.of("/screenshot/" + domainId); } else { resultSet = Collections.emptyList(); } - return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, - "hideRanking", true, - "focusDomain", Objects.requireNonNullElse(domain, ""), - "profile", parameters.profileStr(), - "results", resultSet, "screenshot", - screenshotPath == null ? "" : screenshotPath.toString()))); + Map renderObject = new HashMap<>(10); + + renderObject.put("query", query); + renderObject.put("hideRanking", true); + renderObject.put("profile", parameters.profileStr()); + renderObject.put("results", resultSet); + renderObject.put("screenshot", screenshotPath == null ? "" : screenshotPath.toString()); + renderObject.put("domainId", domainId); + renderObject.put("focusDomain", domain); + + return Optional.of(siteInfoRenderer.render(results, renderObject)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java index 862df844..949c9e5f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java @@ -1,15 +1,11 @@ package nu.marginalia.wmsa.edge.search.model; -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.ToString; +import lombok.*; import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import java.util.List; -@Getter @AllArgsConstructor @NoArgsConstructor +@Getter @AllArgsConstructor @NoArgsConstructor @Builder @ToString public class DomainInformation { EdgeDomain domain; @@ -22,6 +18,9 @@ public class DomainInformation { int outboundLinks; double ranking; - EdgeDomainIndexingState state; + boolean suggestForCrawling; + boolean inCrawlQueue; + + String state; List linkingDomains; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java similarity index 66% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java index 99601965..73831886 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.search; +package nu.marginalia.wmsa.edge.search.model; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.index.model.IndexBlock; @@ -6,6 +6,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.stream.Collectors; public enum EdgeSearchProfile { @@ -13,12 +14,14 @@ public enum EdgeSearchProfile { DEFAULT("default", SearchOrder.DEFAULT_ORDER, 0, 1), MODERN("modern", SearchOrder.DEFAULT_ORDER, 2), CORPO("corpo", 
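The switch above from a single Map.of(...) call to an explicitly populated HashMap is load-bearing, not cosmetic: Map.of rejects null keys and values with a NullPointerException, and focusDomain may now be null since the Objects.requireNonNullElse guard is gone. A small demonstration of the difference (general Java behavior, not code from this patch):

    import java.util.HashMap;
    import java.util.Map;

    public class NullableModelSketch {
        public static void main(String[] args) {
            String focusDomain = null; // e.g. no site: filter was given

            Map<String, Object> model = new HashMap<>();
            model.put("focusDomain", focusDomain); // fine: HashMap permits null values

            try {
                Map.of("focusDomain", focusDomain); // throws at runtime
            } catch (NullPointerException e) {
                System.out.println("Map.of rejected the null value");
            }
        }
    }
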
SearchOrder.DEFAULT_ORDER, 4, 5, 7), - YOLO("yolo", SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 4, 6), - CORPO_CLEAN("corpo-clean", SearchOrder.DEFAULT_ORDER, 4, 5), + YOLO("yolo", SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 6), + CORPO_CLEAN("corpo-clean", SearchOrder.DEFAULT_ORDER, 0, 1), ACADEMIA("academia", SearchOrder.DEFAULT_ORDER, 3), FOOD("food", SearchOrder.DEFAULT_ORDER, 2, 0), CRAFTS("crafts", SearchOrder.DEFAULT_ORDER, 2, 0), + + CLASSICS("classics", SearchOrder.DEFAULT_ORDER, 4, 5, 7), ; @@ -34,20 +37,19 @@ public enum EdgeSearchProfile { this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); } - static EdgeSearchProfile getSearchProfile(String param) { + private final static EdgeSearchProfile[] values = values(); + public static EdgeSearchProfile getSearchProfile(String param) { if (null == param) { return YOLO; } - return switch (param) { - case "modern" -> MODERN; - case "default" -> DEFAULT; - case "corpo" -> CORPO; - case "academia" -> ACADEMIA; - case "food" -> FOOD; - case "crafts" -> CRAFTS; - default -> YOLO; - }; + for (var profile : values) { + if (Objects.equals(profile.name, param)) { + return profile; + } + } + + return YOLO; } public void addTacitTerms(EdgeSearchSubquery subquery) { @@ -57,11 +59,13 @@ public enum EdgeSearchProfile { if (this == CRAFTS) { subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword()); } + } + public String getNearDomain() { + if (this == CLASSICS) { + return "classics.mit.edu"; + } + return null; } } -class SearchOrder { - static List DEFAULT_ORDER - = List.of(IndexBlock.Title, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); -} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchRankingSymbols.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchRankingSymbols.java index 1c3a85aa..989db072 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchRankingSymbols.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchRankingSymbols.java @@ -7,11 +7,11 @@ public class EdgeSearchRankingSymbols { private static final TreeMap symbols; static { symbols = new TreeMap<>(); - symbols.put(1.0, new RankingSymbol("⭐", "Fits search terms very well")); - symbols.put(2.0, new RankingSymbol("🟢", "Fits search terms well")); - symbols.put(4.0, new RankingSymbol("🟡", "Fits search terms decently")); - symbols.put(6.0, new RankingSymbol("🟠", "Could fit search terms")); - symbols.put(100.0, new RankingSymbol("🟤", "Poor fit for search terms, grasping at straws")); + symbols.put(1.0, new RankingSymbol("Ⓐ", "Fits search terms very well")); + symbols.put(3.0, new RankingSymbol("Ⓑ", "Fits search terms decently")); + symbols.put(4.0, new RankingSymbol("Ⓒ", "Could fit search terms")); + symbols.put(5.0, new RankingSymbol("Ⓓ", "Poor fit for search terms, grasping at straws")); + symbols.put(100., new RankingSymbol("Ⓕ", "Poor fit for search terms, grasping at straws")); } public static String getRankingSymbol(double termScore) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/SearchOrder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/SearchOrder.java new file mode 100644 index 00000000..831fdbae --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/SearchOrder.java @@ -0,0 +1,10 @@ +package nu.marginalia.wmsa.edge.search.model; + +import 
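Replacing the hand-written switch in getSearchProfile with a loop over a cached array also fixes a quiet omission: the switch mapped any profile it did not list, including corpo-clean and the new classics, to YOLO, while the loop matches every profile by its name field. Caching values() in a static field matters because Enum.values() clones its backing array on every call. A condensed sketch of the pattern, using a hypothetical enum:

    public class EnumLookupSketch {
        enum Color {
            RED("red"), GREEN("green");

            final String label;
            Color(String label) { this.label = label; }

            // values() clones its backing array on every call; cache it once
            private static final Color[] VALUES = values();

            static Color byLabel(String param, Color fallback) {
                if (param == null) return fallback;
                for (Color c : VALUES) {
                    if (c.label.equals(param)) return c;
                }
                return fallback;
            }
        }

        public static void main(String[] args) {
            System.out.println(Color.byLabel("green", Color.RED)); // GREEN
            System.out.println(Color.byLabel("mauve", Color.RED)); // RED
        }
    }
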
nu.marginalia.wmsa.edge.index.model.IndexBlock; + +import java.util.List; + +class SearchOrder { + static List DEFAULT_ORDER + = List.of(IndexBlock.Title, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/NearQueryProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/NearQueryProcessor.java new file mode 100644 index 00000000..60b49d37 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/NearQueryProcessor.java @@ -0,0 +1,65 @@ +package nu.marginalia.wmsa.edge.search.query; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; + +import java.sql.ResultSet; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Consumer; + +public class NearQueryProcessor { + + private final HikariDataSource dataSource; + + @Inject + public NearQueryProcessor(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + @SneakyThrows + public List getRelatedDomains(String term, Consumer onProblem) { + List ret = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + + var selfStmt = conn.prepareStatement(""" + SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=? + """); + var stmt = conn.prepareStatement(""" + SELECT NEIGHBOR_ID, ND.INDEXED, ND.STATE FROM EC_DOMAIN_NEIGHBORS_2 + INNER JOIN EC_DOMAIN ND ON ND.ID=NEIGHBOR_ID + WHERE DOMAIN_ID=? + """)) { + ResultSet rsp; + selfStmt.setString(1, term); + rsp = selfStmt.executeQuery(); + int domainId = -1; + if (rsp.next()) { + domainId = rsp.getInt(1); + ret.add(domainId); + } + + stmt.setInt(1, domainId); + rsp = stmt.executeQuery(); + + while (rsp.next()) { + int id = rsp.getInt(1); + int indexed = rsp.getInt(2); + String state = rsp.getString(3); + + if (indexed > 0 && ("ACTIVE".equalsIgnoreCase(state) || "SOCIAL_MEDIA".equalsIgnoreCase(state) || "SPECIAL".equalsIgnoreCase(state))) { + ret.add(id); + } + } + + } + + if (ret.isEmpty()) { + onProblem.accept("Could not find domains adjacent " + term); + } + + return ret; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 4bb640fb..adf6c2b5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -9,9 +9,10 @@ import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.results.SearchResultValuator; +import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator; import org.eclipse.jetty.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,17 +29,24 @@ public class QueryFactory { private final NGramBloomFilter nGramBloomFilter; private final Logger logger = LoggerFactory.getLogger(getClass()); private final SearchResultValuator searchResultValuator; + private NearQueryProcessor 
nearQueryProcessor; private static final int RETAIN_QUERY_VARIANT_COUNT = 5; @Inject - public QueryFactory(LanguageModels lm, TermFrequencyDict dict, EnglishDictionary englishDictionary, NGramBloomFilter nGramBloomFilter, SearchResultValuator searchResultValuator) { + public QueryFactory(LanguageModels lm, + TermFrequencyDict dict, + EnglishDictionary englishDictionary, + NGramBloomFilter nGramBloomFilter, + SearchResultValuator searchResultValuator, + NearQueryProcessor nearQueryProcessor) { this.lm = lm; this.dict = dict; this.englishDictionary = englishDictionary; this.nGramBloomFilter = nGramBloomFilter; this.searchResultValuator = searchResultValuator; + this.nearQueryProcessor = nearQueryProcessor; } public QueryParser getParser() { @@ -70,7 +78,7 @@ public class QueryFactory { for (var sq : processedQuery.specs.subqueries) { for (var block : profile.indexBlocks) { - subqueries.add(sq.withBlock(block).setValue(sq.getValue() * block.sortOrder)); + subqueries.add(sq.withBlock(block).setValue(sq.getValue() * block.ordinal())); } } @@ -108,6 +116,9 @@ public class QueryFactory { basicQuery.clear(); } + Integer qualityLimit = null; + Integer rankLimit = null; + for (Token t : basicQuery) { if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) { if (t.str.startsWith("site:")) { @@ -117,11 +128,25 @@ public class QueryFactory { searchTermsHuman.addAll(toHumanSearchTerms(t)); analyzeSearchTerm(problems, t); } + if (t.type == TokenType.QUALITY_TERM) { + qualityLimit = Integer.parseInt(t.str); + } + if (t.type == TokenType.RANK_TERM) { + if (profile == EdgeSearchProfile.CORPO) { + problems.add("Rank limit (" + t.displayStr + ") ignored in unranked query"); + } else { + rankLimit = Integer.parseInt(t.str); + } + } } + + var queryPermutations = queryParser.permuteQueriesNew(basicQuery); List subqueries = new ArrayList<>(); + String near = profile.getNearDomain(); + for (var parts : queryPermutations) { List searchTermsExclude = new ArrayList<>(); List searchTermsInclude = new ArrayList<>(); @@ -141,7 +166,11 @@ public class QueryFactory { if (t.str.toLowerCase().startsWith("site:")) { domain = t.str.substring("site:".length()); } - + break; + case QUALITY_TERM: + break; // + case NEAR_TERM: + near = t.str; break; default: logger.warn("Unexpected token type {}", t); @@ -156,14 +185,30 @@ public class QueryFactory { subqueries.add(subquery); } + List domains = Collections.emptyList(); + + if (near != null) { + if (domain == null) { + domains = nearQueryProcessor.getRelatedDomains(near, problems::add); + } + } + + if (qualityLimit != null && domains.isEmpty()) { + problems.add("Quality limit will be ignored when combined with 'near:'"); + } + + var buckets = domains.isEmpty() ? 
profile.buckets : EdgeSearchProfile.CORPO.buckets; EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder() .subqueries(subqueries) .limitTotal(100) .humanQuery(query) - .buckets(profile.buckets) + .buckets(buckets) .timeoutMs(250) - .fetchSize(4096); + .fetchSize(4096) + .quality(qualityLimit) + .rank(rankLimit) + .domains(domains); if (domain != null) { specsBuilder = specsBuilder.limitByDomain(100); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java index 749c7e47..21399588 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java @@ -61,7 +61,18 @@ public class QueryParser { var t = basicTokens.get(i); if (t.type == TokenType.LITERAL_TERM) { - parsedTokens.add(t); + if (t.str.startsWith("q:") && t.str.matches("q:[+-]?\\d+")) { + parsedTokens.add(new Token(TokenType.QUALITY_TERM, t.str.substring(2), t.displayStr)); + } + else if (t.str.startsWith("r:") && t.str.matches("r:\\d+")) { + parsedTokens.add(new Token(TokenType.RANK_TERM, t.str.substring(2), t.displayStr)); + } + else if (t.str.startsWith("near:")) { + parsedTokens.add(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); + } + else { + parsedTokens.add(t); + } continue; } else if (t.type != TokenType.LPAREN) { @@ -477,6 +488,11 @@ enum TokenType { EXCLUDE_TERM, ADVICE_TERM, + QUALITY_TERM, + RANK_TERM, + + NEAR_TERM, + QUOT, MINUS, LPAREN, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeUserSearchParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeUserSearchParameters.java index 66f20af0..a84273fb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeUserSearchParameters.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeUserSearchParameters.java @@ -1,7 +1,7 @@ package nu.marginalia.wmsa.edge.search.query.model; -import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; public record EdgeUserSearchParameters (String humanQuery, EdgeSearchProfile profile, SearchJsParameter jsSetting) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index ea485b24..e6235a74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -3,12 +3,15 @@ package nu.marginalia.wmsa.edge.search.results; import com.google.inject.Inject; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntObjectHashMap; +import nu.marginalia.util.BrailleBlockPunchCards; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.model.id.EdgeIdList; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; import 
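QueryParser above recognizes three new prefixed term forms: q:<n> for a quality limit, r:<n> for a rank limit, and near:<domain> for neighborhood search. Each is stripped of its prefix and re-emitted as a typed token that QueryFactory then consumes. A self-contained sketch of that classification step; the Kind and Token shapes are simplified stand-ins for the patch's TokenType and Token:

    import java.util.regex.Pattern;

    public class SpecialTermSketch {
        enum Kind { QUALITY, RANK, NEAR, LITERAL }
        record Token(Kind kind, String value) {}

        // same syntax the patch accepts: optional sign for quality, digits only for rank
        private static final Pattern QUALITY = Pattern.compile("q:[+-]?\\d+");
        private static final Pattern RANK = Pattern.compile("r:\\d+");

        static Token classify(String str) {
            if (QUALITY.matcher(str).matches()) return new Token(Kind.QUALITY, str.substring(2));
            if (RANK.matcher(str).matches()) return new Token(Kind.RANK, str.substring(2));
            if (str.startsWith("near:")) return new Token(Kind.NEAR, str.substring("near:".length()));
            return new Token(Kind.LITERAL, str);
        }

        public static void main(String[] args) {
            System.out.println(classify("q:-10"));                 // Token[kind=QUALITY, value=-10]
            System.out.println(classify("near:classics.mit.edu")); // Token[kind=NEAR, value=classics.mit.edu]
        }
    }
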
nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; +import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -55,30 +58,44 @@ public class SearchResultDecorator { continue; } - if (details.rankingId == Integer.MAX_VALUE) { - details.rankingId = rankingId; - } + details.rankingId = rankingId; details.resultsFromSameDomain = resultItem.resultsFromDomain; details.termScore = calculateTermScore(resultItem, details); + details.positions = getPositions(resultItem); + details.resultItem = resultItem; logger.debug("{} -> {}", details.url, details.termScore); retList.add(details); } if (!missedIds.isEmpty()) { - logger.debug("Could not look up documents: {}", missedIds.toArray()); + logger.info("Could not look up documents: {}", missedIds.toArray()); } return retList; } + private String getPositions(EdgeSearchResultItem resultItem) { + int bits = resultItem.scores.stream() + .filter(EdgeSearchResultKeywordScore::isRegular) + .mapToInt(EdgeSearchResultKeywordScore::positions) + .reduce(this::or) + .orElse(0); + + return BrailleBlockPunchCards.printBits(bits, 32); + + } + + private int or(int a, int b) { + return a | b; + } + private double calculateTermScore(EdgeSearchResultItem resultItem, EdgeUrlDetails details) { final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0; final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length()); - if (dumpTermData) { System.out.println("---"); System.out.println(details.getUrl()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java deleted file mode 100644 index 4cf4698e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java +++ /dev/null @@ -1,204 +0,0 @@ -package nu.marginalia.wmsa.edge.search.results; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.model.IndexBlockType; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; - -import java.util.List; -import java.util.regex.Pattern; - -import static java.lang.Math.min; - -@Singleton -public class SearchResultValuator { - private final TermFrequencyDict dict; - - private static final Pattern separator = Pattern.compile("_"); - - private static final int MIN_LENGTH = 2000; - private static final int AVG_LENGTH = 5000; - - @Inject - public SearchResultValuator(TermFrequencyDict dict) { - this.dict = dict; - } - - - public double preEvaluate(EdgeSearchSubquery sq) { - final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new); - final IndexBlock index = sq.block; - - double termSum = 0.; - double factorSum = 0.; - - final double[] weights = getTermWeights(terms); - - for (int i = 0; i < terms.length; i++) { - final double factor = 1. 
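getPositions above folds the per-keyword position masks into a single int with bitwise OR, then hands the result to BrailleBlockPunchCards.printBits to render it as a punch-card-style glyph string. The fold itself, isolated, with hypothetical masks (the braille rendering is omitted; binary output stands in for it):

    import java.util.stream.IntStream;

    public class PositionMaskSketch {
        public static void main(String[] args) {
            // hypothetical masks: bit i set means the keyword occurs in position bucket i
            int[] perKeywordMasks = { 0b0001_0010, 0b0100_0010 };

            int combined = IntStream.of(perKeywordMasks)
                    .reduce(0, (a, b) -> a | b); // same fold as the patch's reduce(this::or)

            System.out.println(Integer.toBinaryString(combined)); // 1010010
        }
    }
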
/ (1.0 + weights[i]); - - factorSum += factor; - termSum += (index.sortOrder + 0.5) * factor; - } - - return termSum / factorSum; - } - - public double evaluateTerms(List rawScores, int length, int titleLength) { - int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0); - - double bestScore = 1000; - double bestAllTermsFactor = 1.; - - int termCount = 5; - - for (int set = 0; set <= sets; set++) { - int thisSet = set; - EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new); - - if (scores.length == 0) { - continue; - } - - final double[] weights = getTermWeights(scores); - final double lengthPenalty = getLengthPenalty(length); - - double termSum = 0.; - double factorSum = 0.; - - double allTermsFactor = 1.0; - - for (int i = 0; i < scores.length; i++) { - - final double factor = 1. / (1.0 + weights[i]); - - factorSum += factor; - termSum += (scores[i].index().sortOrder + 0.5) * factor / lengthPenalty; - - } - - assert factorSum != 0; - - double value = termSum / factorSum; - - for (int i = 0; i < scores.length; i++) { - final double factor = 1. / (1.0 + weights[i]); - - allTermsFactor *= getAllTermsFactorForScore(scores[i], scores[i].index(), factor/factorSum, scores.length, titleLength); - } - - termCount = min(termCount, scores.length); - bestAllTermsFactor = min(bestAllTermsFactor, allTermsFactor); - bestScore = min(bestScore, value); - } - - return bestScore * bestAllTermsFactor * Math.sqrt(1. + termCount); - } - - private double getAllTermsFactorForScore(EdgeSearchResultKeywordScore score, IndexBlock block, double termWeight, int scoreCount, int titleLength) { - double f = 1.; - - - if (score.link()) { - f *= Math.pow(0.5, termWeight / scoreCount); - } - - if (score.title()) { - if (block.type.equals(IndexBlockType.PAGE_DATA)) { - f *= Math.pow(0.8, termWeight / scoreCount); - } - else if (titleLength <= 64) { - f *= Math.pow(0.5, termWeight / scoreCount); - } - else if (titleLength < 96) { - f *= Math.pow(0.75, termWeight / scoreCount); - } - else { // likely keyword stuffing if the title is this long - f *= Math.pow(0.9, termWeight / scoreCount); - } - } - - if (!block.type.equals(IndexBlockType.TF_IDF)) { - if (score.high()) { - f *= Math.pow(0.75, termWeight / scoreCount); - } else if (score.mid()) { - f *= Math.pow(0.8, termWeight / scoreCount); - } else if (score.low()) { - f *= Math.pow(0.9, termWeight / scoreCount); - } - } - - if (score.site()) { - f *= Math.pow(0.75, termWeight / scoreCount); - } - - if (score.subject()) { - f *= Math.pow(0.8, termWeight / scoreCount); - } - - if (!score.title() && !score.subject() && score.name()) { - f *= Math.pow(0.9, termWeight / scoreCount); - } - - return f; - } - - private double getLengthPenalty(int length) { - if (length < MIN_LENGTH) { - length = MIN_LENGTH; - } - if (length > AVG_LENGTH) { - length = AVG_LENGTH; - } - return (0.5 + 0.5 * length / AVG_LENGTH); - } - - private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) { - double[] weights = new double[scores.length]; - - for (int i = 0; i < scores.length; i++) { - String[] parts = separator.split(scores[i].keyword()); - double sumScore = 0.; - - int count = 0; - for (String part : parts) { - if (!WordPatterns.isStopWord(part)) { - sumScore += dict.getTermFreq(part); - count++; - } - } - if (count == 0) count = 1; - - weights[i] = Math.sqrt(sumScore)/count; - } - - return weights; - } - - - private double[] 
getTermWeights(String[] words) { - double[] weights = new double[words.length]; - - for (int i = 0; i < words.length; i++) { - String[] parts = separator.split(words[i]); - double sumScore = 0.; - - int count = 0; - for (String part : parts) { - if (!WordPatterns.isStopWord(part)) { - sumScore += dict.getTermFreq(part); - count++; - } - } - if (count == 0) count = 1; - - weights[i] = Math.sqrt(sumScore)/count; - } - - return weights; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index ba793081..d12b4a41 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -46,19 +46,59 @@ public class DomainInformationService { if (domainId == null) { return Optional.empty(); } - EdgeDomain domain = dataStoreDao.getDomain(domainId); - boolean blacklisted = isBlacklisted(domain); + Optional domain = dataStoreDao.getDomain(domainId); + if (domain.isEmpty()) { + return Optional.empty(); + } + + boolean blacklisted = isBlacklisted(domain.get()); int pagesKnown = getPagesKnown(domainId); int pagesVisited = getPagesVisited(domainId); int pagesIndexed = getPagesIndexed(domainId); int incomingLinks = getIncomingLinks(domainId); int outboundLinks = getOutboundLinks(domainId); + + boolean inCrawlQueue = inCrawlQueue(domainId); + double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; + EdgeDomainIndexingState state = getDomainState(domainId); List linkingDomains = getLinkingDomains(domainId); - return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, rank, state, linkingDomains)); + var di = DomainInformation.builder() + .domain(domain.get()) + .blacklisted(blacklisted) + .pagesKnown(pagesKnown) + .pagesFetched(pagesVisited) + .pagesIndexed(pagesIndexed) + .incomingLinks(incomingLinks) + .outboundLinks(outboundLinks) + .ranking(rank) + .state(state.desc) + .linkingDomains(linkingDomains) + .inCrawlQueue(inCrawlQueue) + .suggestForCrawling((pagesVisited == 0 && !inCrawlQueue)) + .build(); + + return Optional.of(di); + } + + @SneakyThrows + private boolean inCrawlQueue(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + SELECT 1 FROM CRAWL_QUEUE + INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME + WHERE EC_DOMAIN.ID=? 
+ """)) + { + stmt.setInt(1, domainId.id()); + var rsp = stmt.executeQuery(); + return rsp.next(); + } + } } private EdgeId getDomainFromPartial(String site) { @@ -66,12 +106,7 @@ public class DomainInformationService { return dataStoreDao.getDomainId(new EdgeDomain(site)); } catch (Exception ex) { - try { - return dataStoreDao.getDomainId(new EdgeDomain(site)); - } - catch (Exception ex2) { - return null; - } + return null; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java new file mode 100644 index 00000000..f2421ded --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java @@ -0,0 +1,70 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.wmsa.configuration.WebsiteUrl; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.model.id.EdgeId; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.sql.SQLException; + +public class EdgeSearchAddToCrawlQueueService { + + private EdgeDataStoreDao edgeDataStoreDao; + private WebsiteUrl websiteUrl; + private HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(EdgeSearchAddToCrawlQueueService.class); + + @Inject + public EdgeSearchAddToCrawlQueueService(EdgeDataStoreDao edgeDataStoreDao, + WebsiteUrl websiteUrl, + HikariDataSource dataSource) { + this.edgeDataStoreDao = edgeDataStoreDao; + this.websiteUrl = websiteUrl; + this.dataSource = dataSource; + } + + public Object suggestCrawling(Request request, Response response) throws SQLException { + logger.info("{}", request.queryParams()); + int id = Integer.parseInt(request.queryParams("id")); + boolean nomisclick = "on".equals(request.queryParams("nomisclick")); + + String domainName = getDomainName(id); + + if (nomisclick) { + logger.info("Adding {} to crawl queue", domainName); + addToCrawlQueue(id); + } + else { + logger.info("Nomisclick not set, not adding {} to crawl queue", domainName); + } + + response.redirect(websiteUrl.withPath("/site/" + domainName)); + + return ""; + } + + private void addToCrawlQueue(int id) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE) + SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=? 
+ """)) { + stmt.setInt(1, id); + stmt.executeUpdate(); + } + } + + private String getDomainName(int id) { + var domain = edgeDataStoreDao.getDomain(new EdgeId<>(id)); + if (domain.isEmpty()) + Spark.halt(404); + return domain.get().toString(); + } +} + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchApiQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchApiQueryService.java new file mode 100644 index 00000000..6c5ae7fd --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchApiQueryService.java @@ -0,0 +1,55 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.common.base.Strings; +import com.google.inject.Inject; +import lombok.SneakyThrows; +import nu.marginalia.wmsa.api.model.ApiSearchResult; +import nu.marginalia.wmsa.api.model.ApiSearchResults; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; +import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; +import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; +import spark.Request; +import spark.Response; + +import java.util.stream.Collectors; + +public class EdgeSearchApiQueryService { + private EdgeSearchOperator searchOperator; + + @Inject + public EdgeSearchApiQueryService(EdgeSearchOperator searchOperator) { + this.searchOperator = searchOperator; + } + + @SneakyThrows + public Object apiSearch(Request request, Response response) { + + final var ctx = Context.fromRequest(request); + final String queryParam = request.queryParams("query"); + final int limit; + EdgeSearchProfile profile = EdgeSearchProfile.YOLO; + + String count = request.queryParamOrDefault("count", "20"); + limit = Integer.parseInt(count); + + String index = request.queryParamOrDefault("index", "0"); + if (!Strings.isNullOrEmpty(index)) { + profile = switch (index) { + case "0" -> EdgeSearchProfile.YOLO; + case "1" -> EdgeSearchProfile.MODERN; + case "2" -> EdgeSearchProfile.DEFAULT; + case "3" -> EdgeSearchProfile.CORPO_CLEAN; + default -> EdgeSearchProfile.CORPO_CLEAN; + }; + } + + final String humanQuery = queryParam.trim(); + + var results = searchOperator.doApiSearch(ctx, new EdgeUserSearchParameters(humanQuery, profile, SearchJsParameter.DEFAULT)); + + return new ApiSearchResults("RESTRICTED", humanQuery, results.stream().map(ApiSearchResult::new).limit(limit).collect(Collectors.toList())); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchFlagSiteService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchFlagSiteService.java new file mode 100644 index 00000000..fec5c737 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchFlagSiteService.java @@ -0,0 +1,125 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; +import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.io.IOException; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +public class EdgeSearchFlagSiteService { + private 
final MustacheRenderer formTemplate; + private final HikariDataSource dataSource; + + private final CategoryItem unknownCategory = new CategoryItem("unknown", "Unknown"); + + private final List categories = + List.of( + new CategoryItem("spam", "Spam"), + new CategoryItem("freebooting", "Reposting Stolen Content"), + new CategoryItem("broken", "Broken Website"), + new CategoryItem("shock", "Shocking/Offensive"), + new CategoryItem("blacklist", "Review Blacklisting") + ); + + private final Map categoryItemMap = + categories.stream().collect(Collectors.toMap(CategoryItem::categoryName, Function.identity())); + @Inject + public EdgeSearchFlagSiteService(RendererFactory rendererFactory, + HikariDataSource dataSource) throws IOException { + formTemplate = rendererFactory.renderer("edge/indict/indict-form"); + this.dataSource = dataSource; + } + + public Object flagSiteForm(Request request, Response response) throws SQLException { + final int domainId = Integer.parseInt(request.params("domainId")); + + var model = getModel(domainId, false); + return formTemplate.render(model); + } + + public Object flagSiteAction(Request request, Response response) throws SQLException { + + int domainId = Integer.parseInt(request.params("domainId")); + + var formData = new FlagSiteFormData( + domainId, + request.queryParams("category"), + request.queryParams("description"), + request.queryParams("samplequery") + ); + + insertComplaint(formData); + + return formTemplate.render(getModel(domainId, true)); + } + + private void insertComplaint(FlagSiteFormData formData) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement( + """ + INSERT INTO DOMAIN_COMPLAINT(DOMAIN_ID, CATEGORY, DESCRIPTION, SAMPLE) VALUES (?, ?, ?, ?) + """)) { + stmt.setInt(1, formData.domainId); + stmt.setString(2, formData.category); + stmt.setString(3, formData.description); + stmt.setString(4, formData.sampleQuery); + stmt.executeUpdate(); + } + } + + private FlagSiteViewModel getModel(int id, boolean isSubmitted) throws SQLException { + + + try (var conn = dataSource.getConnection(); + var complaintsStmt = conn.prepareStatement(""" + SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION + FROM DOMAIN_COMPLAINT + WHERE DOMAIN_ID=? + """); + var stmt = conn.prepareStatement( + """ + SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE EC_DOMAIN.ID=? 
+ """)) + { + List complaints = new ArrayList<>(); + + complaintsStmt.setInt(1, id); + ResultSet rs = complaintsStmt.executeQuery(); + + while (rs.next()) { + complaints.add(new FlagSiteComplaintModel( + categoryItemMap.getOrDefault(rs.getString(1), unknownCategory).categoryDesc, + rs.getString(2), + rs.getBoolean(3), + rs.getString(4))); + } + + stmt.setInt(1, id); + rs = stmt.executeQuery(); + if (!rs.next()) { + Spark.halt(404); + } + return new FlagSiteViewModel(id, + rs.getString(1), + categories, + complaints, + isSubmitted); + } + } + + public record CategoryItem(String categoryName, String categoryDesc) {} + public record FlagSiteViewModel(int domainId, String domain, List category, List complaints, boolean isSubmitted) {} + public record FlagSiteComplaintModel(String category, String submitTime, boolean isReviewed, String decision) {} + public record FlagSiteFormData(int domainId, String category, String description, String sampleQuery) {}; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java index 29983675..93e61562 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java @@ -6,7 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.search.*; -import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator; import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator; @@ -20,13 +20,15 @@ public class EdgeSearchQueryIndexService { private final SearchResultDecorator resultDecorator; private final Comparator resultListComparator; private final EdgeIndexClient indexClient; + @Inject public EdgeSearchQueryIndexService(SearchResultDecorator resultDecorator, EdgeIndexClient indexClient) { this.resultDecorator = resultDecorator; this.indexClient = indexClient; Comparator c = Comparator.comparing(ud -> Math.round(10*(ud.getTermScore() - ud.rankingIdAdjustment()))); - resultListComparator = c.thenComparing(EdgeUrlDetails::getRanking) + resultListComparator = c + .thenComparing(EdgeUrlDetails::getRanking) .thenComparing(EdgeUrlDetails::getId); } @@ -35,7 +37,7 @@ public class EdgeSearchQueryIndexService { sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), block)); - EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, limitPerDomain, limitTotal, "", 150, 2048); + EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, Collections.emptyList(), limitPerDomain, limitTotal, "", 150, 2048, null, null); return performQuery(ctx, new EdgeSearchQuery(specs)); } @@ -43,10 +45,13 @@ public class EdgeSearchQueryIndexService { public List performQuery(Context ctx, EdgeSearchQuery processedQuery) { final List results = indexClient.query(ctx, processedQuery.specs); + final List resultList = new ArrayList<>(results.size()); + long badQCount = 0; for (var details : resultDecorator.getAllUrlDetails(results)) { if (details.getUrlQuality() <= 
-100) { + badQCount++; continue; } @@ -61,6 +66,9 @@ public class EdgeSearchQueryIndexService { UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain); List retList = new ArrayList<>(processedQuery.specs.limitTotal); + if (badQCount > 0) { + System.out.println(badQCount); + } for (var item : resultList) { if (retList.size() >= processedQuery.specs.limitTotal) break; @@ -108,7 +116,7 @@ public class EdgeSearchQueryIndexService { long domainHits = Arrays.stream(searchTermsLC).filter(domainLC::contains).count(); double descHitsAdj = 0.; - for (String word : descLC.split("[^\\w]+")) { + for (String word : descLC.split("\\W+")) { descHitsAdj += Arrays.stream(searchTermsLC) .filter(term -> term.length() > word.length()) .filter(term -> term.contains(word)) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryService.java new file mode 100644 index 00000000..41a50ee6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryService.java @@ -0,0 +1,70 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.inject.Inject; +import lombok.SneakyThrows; +import nu.marginalia.wmsa.configuration.WebsiteUrl; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.search.command.CommandEvaluator; +import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; +import nu.marginalia.wmsa.edge.search.command.SearchParameters; +import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; + +import java.util.Optional; + +public class EdgeSearchQueryService { + + private WebsiteUrl websiteUrl; + private final EdgeSearchErrorPageService errorPageService; + private final CommandEvaluator searchCommandEvaulator; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public EdgeSearchQueryService( + WebsiteUrl websiteUrl, + EdgeSearchErrorPageService errorPageService, + CommandEvaluator searchCommandEvaulator) { + this.websiteUrl = websiteUrl; + this.errorPageService = errorPageService; + this.searchCommandEvaulator = searchCommandEvaulator; + } + + @SneakyThrows + public Object pathSearch(Request request, Response response) { + + final var ctx = Context.fromRequest(request); + + final String queryParam = request.queryParams("query"); + if (null == queryParam || queryParam.isBlank()) { + response.redirect(websiteUrl.url()); + return null; + } + + final String profileStr = Optional.ofNullable(request.queryParams("profile")).orElse(EdgeSearchProfile.YOLO.name); + final String humanQuery = queryParam.trim(); + + var params = new SearchParameters( + EdgeSearchProfile.getSearchProfile(profileStr), + SearchJsParameter.parse(request.queryParams("js")), + Boolean.parseBoolean(request.queryParams("detailed")) + ); + + try { + return searchCommandEvaulator.eval(ctx, params, humanQuery); + } + catch (RedirectException ex) { + response.redirect(ex.newUrl); + } + catch (Exception ex) { + logger.error("Error", ex); + errorPageService.serveError(ctx, response); + } + + return ""; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java 
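The split pattern above changes from "[^\\w]+" to "\\W+"; the two are equivalent (one or more non-word characters), the new form is simply the standard shorthand. A stand-alone check, not code from the patch:

    public class RegexEquivalenceSketch {
        public static void main(String[] args) {
            String s = "foo, bar; baz!";
            System.out.println(String.join("|", s.split("[^\\w]+"))); // foo|bar|baz
            System.out.println(String.join("|", s.split("\\W+")));    // foo|bar|baz
        }
    }
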
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java new file mode 100644 index 00000000..76a74408 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java @@ -0,0 +1,292 @@ +package nu.marginalia.wmsa.edge.search.valuation; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.util.language.WordPatterns; +import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; +import org.jetbrains.annotations.NotNull; + +import java.util.Arrays; +import java.util.EnumSet; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Pattern; + +import static java.lang.Math.min; + +@Singleton +public class SearchResultValuator { + private final TermFrequencyDict dict; + + private static final Pattern separator = Pattern.compile("_"); + + private static final int MIN_LENGTH = 2000; + private static final int AVG_LENGTH = 5000; + + @Inject + public SearchResultValuator(TermFrequencyDict dict) { + this.dict = dict; + } + + + public double preEvaluate(EdgeSearchSubquery sq) { + final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new); + final IndexBlock index = sq.block; + + double termSum = 0.; + double factorSum = 0.; + + final double[] weights = getTermWeights(terms); + + for (int i = 0; i < terms.length; i++) { + final double factor = 1. / (1.0 + weights[i]); + + factorSum += factor; + termSum += (index.ordinal() + 0.5) * factor; + } + + return termSum / factorSum; + } + + public double evaluateTerms(List rawScores, int length, int titleLength) { + int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0); + + double bestPosFactor = 10; + double bestScore = 10; + double bestAllTermsFactor = 1.; + + for (int set = 0; set <= sets; set++) { + SearchResultsKeywordSet keywordSet = createKeywordSet(rawScores, set); + + if (keywordSet == null) + continue; + + final double lengthPenalty = getLengthPenalty(length); + + final double bm25Factor = getBM25(keywordSet, lengthPenalty); + final double minCountFactor = getMinCountFactor(keywordSet); + final double posFactor = posFactor(keywordSet); + + bestScore = min(bestScore, bm25Factor * minCountFactor); + bestPosFactor = min(bestPosFactor, posFactor); + bestAllTermsFactor = min(bestAllTermsFactor, getAllTermsFactorForSet(keywordSet, titleLength)); + } + + return (0.7 + 0.3 * bestPosFactor) * bestScore * (0.3 + 0.7 * bestAllTermsFactor); + } + + private double getMinCountFactor(SearchResultsKeywordSet keywordSet) { + // Penalize results with few keyword hits + + int min = 32; + + for (var keyword : keywordSet) { + min = min(min, keyword.count()); + } + + if (min <= 1) return 2; + if (min <= 2) return 1; + if (min <= 3) return 0.75; + return 0.5; + } + + private double getBM25(SearchResultsKeywordSet keywordSet, double lengthPenalty) { + + // This is a fairly bastardized BM25; the weight factors below are used to + // transform it on a scale from 0 ... 10; where 0 is best, 10+ is worst. + // + // ... 
for historical reasons + // + + final double wf1 = 1.0; + final double wf2 = 2000.; + + double termSum = 0.; + double factorSum = 0.; + + for (var keyword : keywordSet) { + double tfIdf = Math.min(255, keyword.tfIdf()); + final double factor = 1.0 / (1.0 + keyword.weight()); + + factorSum += factor; + termSum += (1 + wf1*tfIdf) * factor; + } + + termSum /= lengthPenalty; + + return Math.sqrt(wf2 / (termSum / factorSum)); + } + + private double posFactor(SearchResultsKeywordSet keywordSet) { + // Penalize keywords that first appear late in the document + + double avgPos = 0; + for (var keyword : keywordSet) { + avgPos += keyword.score().firstPos(); + } + avgPos /= keywordSet.length(); + + return Math.sqrt(1 + avgPos / 3.); + } + + + private double getAllTermsFactorForSet(SearchResultsKeywordSet set, int titleLength) { + double totalFactor = 1.; + + double totalWeight = 0; + for (var keyword : set) { + totalWeight += keyword.weight(); + } + + for (var keyword : set) { + totalFactor *= getAllTermsFactor(keyword, totalWeight, titleLength); + } + + return totalFactor; + } + + private double getAllTermsFactor(SearchResultsKeyword keyword, double totalWeight, int titleLength) { + double f = 1.; + + final double k = keyword.weight() / totalWeight; + + EnumSet flags = keyword.flags(); + + final boolean title = flags.contains(EdgePageWordFlags.Title); + final boolean site = flags.contains(EdgePageWordFlags.Site); + final boolean siteAdjacent = flags.contains(EdgePageWordFlags.SiteAdjacent); + final boolean subject = flags.contains(EdgePageWordFlags.Subjects); + final boolean names = flags.contains(EdgePageWordFlags.NamesWords); + + if (title) { + if (titleLength <= 64) { + f *= Math.pow(0.5, k); + } + else if (titleLength < 96) { + f *= Math.pow(0.75, k); + } + else { // likely keyword stuffing if the title is this long + f *= Math.pow(0.9, k); + } + } + + if (site) { + f *= Math.pow(0.75, k); + } + else if (siteAdjacent) { + f *= Math.pow(0.8, k); + } + + if (subject) { + f *= Math.pow(0.8, k); + } + + if (!title && !subject && names) { + f *= Math.pow(0.9, k); + } + + return f; + } + + private double getLengthPenalty(int length) { + if (length < MIN_LENGTH) { + length = MIN_LENGTH; + } + if (length > AVG_LENGTH) { + length = AVG_LENGTH; + } + return (0.5 + 0.5 * length / AVG_LENGTH); + } + + private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) { + double[] weights = new double[scores.length]; + + for (int i = 0; i < scores.length; i++) { + String[] parts = separator.split(scores[i].keyword()); + double sumScore = 0.; + + int count = 0; + for (String part : parts) { + if (!WordPatterns.isStopWord(part)) { + sumScore += dict.getTermFreq(part); + count++; + } + } + if (count == 0) count = 1; + + weights[i] = Math.sqrt(sumScore)/count; + } + + return weights; + } + + + private double[] getTermWeights(String[] words) { + double[] weights = new double[words.length]; + + for (int i = 0; i < words.length; i++) { + String[] parts = separator.split(words[i]); + double sumScore = 0.; + + int count = 0; + for (String part : parts) { + if (!WordPatterns.isStopWord(part)) { + sumScore += dict.getTermFreq(part); + count++; + } + } + if (count == 0) count = 1; + + weights[i] = Math.sqrt(sumScore)/count; + } + + return weights; + } + + private SearchResultsKeywordSet createKeywordSet(List rawScores, int thisSet) { + EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new); + if 
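getLengthPenalty above clamps document length into [MIN_LENGTH, AVG_LENGTH] and maps it linearly onto [0.7, 1.0], so very short documents score worse and anything past average length saturates. A stand-alone restatement of that function, reusing the constants the valuator declares:

    public class LengthPenaltySketch {
        private static final int MIN_LENGTH = 2000;
        private static final int AVG_LENGTH = 5000;

        static double lengthPenalty(int length) {
            length = Math.max(MIN_LENGTH, Math.min(length, AVG_LENGTH));
            return 0.5 + 0.5 * length / AVG_LENGTH;
        }

        public static void main(String[] args) {
            System.out.println(lengthPenalty(100));   // 0.7 (clamped up to 2000)
            System.out.println(lengthPenalty(5000));  // 1.0
            System.out.println(lengthPenalty(50000)); // 1.0 (clamped down to 5000)
        }
    }
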
(scores.length == 0) { + return null; + } + final double[] weights = getTermWeights(scores); + + SearchResultsKeyword[] keywords = new SearchResultsKeyword[scores.length]; + for (int i = 0; i < scores.length; i++) { + keywords[i] = new SearchResultsKeyword(scores[i], weights[i]); + } + + return new SearchResultsKeywordSet(keywords); + + } + + + private record SearchResultsKeyword(EdgeSearchResultKeywordScore score, double weight) { + public int tfIdf() { + return score.metadata().tfIdf(); + } + public int count() { + return score.metadata().count(); + } + public EnumSet flags() { + return score().metadata().flags(); + } + } + + private record SearchResultsKeywordSet( + SearchResultsKeyword[] keywords) implements Iterable + { + @NotNull + @Override + public Iterator iterator() { + return Arrays.stream(keywords).iterator(); + } + + public int length() { + return keywords.length; + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java index afcb22ed..5288eac1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java @@ -8,6 +8,7 @@ import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.ConverterModule; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.GoogleAnwersSpamDetector; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; @@ -28,6 +29,7 @@ public class ConverterLogicTestTool { RecipeDetector recipeDetector = new RecipeDetector(); WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); + GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector(); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); @@ -72,12 +74,15 @@ public class ConverterLogicTestTool { if (textileCraftDetector.testP(dld) > 0.3) { System.out.println("textilecraft\t" + doc.url); } - if (woodworkingDetector.testP(dld) > 0.2) { + if (woodworkingDetector.testP(dld) > 0.1) { System.out.println("woodworking\t" + doc.url); } if (recipeDetector.testP(dld) > 0.5) { System.out.println("recipe\t" + doc.url); } + if (spamDetector.testP(parsed) > 0.5) { + System.out.println("GA spam\t" + doc.url); + } }; if (cp.getQueuedSubmissionCount() > 32) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java index 97d42baa..55590d1a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java @@ -49,7 +49,7 @@ public class FeaturesLoaderTool { } client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), - new DocumentKeywords(IndexBlock.Meta, feature.getKeyword()) + new DocumentKeywords(IndexBlock.Meta, new String[] { feature.getKeyword() }, new long[] { 0 }) , 0); }); diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java index 4927780c..846b4751 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java @@ -59,7 +59,7 @@ public class SearchIndexScrubberMain { channel.read(inByteBuffer); - if (chunkBlock == IndexBlock.Link.id) { + if (chunkBlock == IndexBlock.Link.ordinal()) { for (int i = 0; i < randomAccessFiles.length; i++) { inByteBuffer.flip(); fileChannels[i].write(inByteBuffer); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java index 0f46b164..74301b2d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java @@ -1,10 +1,8 @@ package nu.marginalia.wmsa.encyclopedia; import com.google.inject.AbstractModule; -import lombok.SneakyThrows; public class EncyclopediaModule extends AbstractModule { - @SneakyThrows @Override public void configure() { } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index d2def737..994c6473 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -103,6 +103,16 @@ CREATE TABLE EC_DOMAIN_NEIGHBORS ( CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; +CREATE TABLE EC_DOMAIN_NEIGHBORS_2 ( + DOMAIN_ID INT NOT NULL, + NEIGHBOR_ID INT NOT NULL, + RELATEDNESS DOUBLE NOT NULL, + + PRIMARY KEY (DOMAIN_ID, NEIGHBOR_ID), + FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE, + FOREIGN KEY (NEIGHBOR_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE +); + CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK ( ID INT PRIMARY KEY AUTO_INCREMENT, SOURCE_DOMAIN_ID INT NOT NULL, @@ -150,6 +160,18 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS INNER JOIN EC_DOMAIN ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID; +CREATE OR REPLACE VIEW EC_NEIGHBORS_VIEW AS + SELECT + DOM.DOMAIN_NAME AS DOMAIN_NAME, + DOM.ID AS DOMAIN_ID, + NEIGHBOR.DOMAIN_NAME AS NEIGHBOR_NAME, + NEIGHBOR.ID AS NEIGHBOR_ID, + ROUND(100 * RELATEDNESS) AS RELATEDNESS + FROM EC_DOMAIN_NEIGHBORS_2 + INNER JOIN EC_DOMAIN DOM ON DOMAIN_ID=DOM.ID + INNER JOIN EC_DOMAIN NEIGHBOR ON NEIGHBOR_ID=NEIGHBOR.ID; + + CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS SELECT SOURCE_DOMAIN_ID, @@ -230,4 +252,36 @@ CREATE TABLE DATA_DOMAIN_HISTORY ( CREATE TABLE CRAWL_QUEUE( DOMAIN_NAME VARCHAR(255) UNIQUE, SOURCE VARCHAR(255) -) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; \ No newline at end of file +) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; + +CREATE TABLE DOMAIN_COMPLAINT( + ID INT PRIMARY KEY AUTO_INCREMENT, + DOMAIN_ID INT NOT NULL, + + CATEGORY VARCHAR(255) NOT NULL, + DESCRIPTION TEXT, + SAMPLE VARCHAR(255), + FILE_DATE TIMESTAMP NOT NULL DEFAULT NOW(), + + REVIEWED BOOLEAN AS (REVIEW_DATE > 0) VIRTUAL, + DECISION VARCHAR(255), + REVIEW_DATE TIMESTAMP, + + FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE +); + +--- + +CREATE TABLE WMSA_PROCESS( + ID BIGINT PRIMARY KEY, + NAME VARCHAR(255) NOT NULL, + TYPE ENUM('SERVICE', 'TASK') NOT NULL, + START DATETIME NOT NULL DEFAULT NOW(), + UPDATED 
DATETIME, + FINISHED DATETIME, + PROGRESS DOUBLE DEFAULT 0, + PROCESS_STATUS ENUM('RUNNING', 'FINISHED', 'DEAD') NOT NULL DEFAULT 'RUNNING', + PROCESS_SUBSTATUS ENUM('NA', 'OK', 'FAIL') NOT NULL DEFAULT 'NA', + MUTEX VARCHAR(255), + TIMEOUT INT NOT NULL DEFAULT 60 +); diff --git a/marginalia_nu/src/main/resources/static/edge/style-new.css b/marginalia_nu/src/main/resources/static/edge/style-new.css index 599a487f..23da585b 100644 --- a/marginalia_nu/src/main/resources/static/edge/style-new.css +++ b/marginalia_nu/src/main/resources/static/edge/style-new.css @@ -59,6 +59,53 @@ header nav a:hover, header nav a:focus { color: #fff !important; } +.form { + max-width: 60ch; + padding: 1ch; + margin: auto; +} + +.form { + background-color: #fff; +} + +.form h1 { + color: #fff; + background-color: #2f4858; + font-size: medium; + font-family: serif; + padding: .5ch .5ch .5ch .5ch; + margin-top: unset; + margin-left: unset; + + font-family: 'Trebuchet', 'Noto Sans', sans-serif; + + text-decoration: none; +} + +.form input[type="submit"] { + float: right; +} + +.form input[type="text"], .form select, .form textarea { + width: 80%; +} + +table > * { + vertical-align: baseline; + text-align: left; +} + +thead th { + font-weight: normal; + font-size: 14pt; +} + +footer td, footer th { + padding-bottom: 8px; + padding-right: 8px; +} + article { max-width: 160ch; margin-left: auto; @@ -88,8 +135,6 @@ ul.semantic-results a { gap: 1ch; } -article > section > p { display: none; } - .cards.big .card { flex-grow: 1 } .cards.big { padding-right: 1ch; } @@ -97,15 +142,20 @@ article > section > p { display: none; } display: none; } -/* -.card.rs-rank-1,.card.rs-rank-2,.card.rs-rank-3,.card.rs-rank-4 { - border: 1px solid #fe0; - box-sizing: border-box; - box-shadow: 0 0 5px #fe0; +.ms-rank-1 > .url, .ms-rank-1 > h2 { filter: grayscale(0%); } +.ms-rank-2 > .url, .ms-rank-2 > h2 { filter: grayscale(5%); } +.ms-rank-3 > .url, .ms-rank-3 > h2 { filter: grayscale(15%); } +.ms-rank-4 > .url, .ms-rank-4 > h2 { filter: grayscale(20%); } +.ms-rank-5 > .url, .ms-rank-5 > h2 { filter: grayscale(30%); } +.ms-rank-10 > .url, .ms-rank-10 > h2 { filter: grayscale(60%); } + +.positions { + box-shadow: 0px 0px 2px #888; + background-color: #e4e4e4; + padding: 2px; + margin-right: -1ch; + margin-left: 1ch; } -*/ - - .big .card { min-width: 40ch; } @@ -323,9 +373,40 @@ footer { clear: both; padding: 2ch; margin: 16ch 0px 0px 0px; - background-color: #acae89; - height: 20ch; - font-size: 10pt; + background: #e2e2c2; + border-top: 1px solid #acae89; + box-shadow: 5px 0px 5px #acae89; + + font-size: 12pt; + display: flex; + flex-direction: row; + flex-wrap: wrap; + justify-content: flex-start; +} + +footer h2 { + font-size: 14pt; + border-bottom: 1px solid #888; + width: 80%; +} + +footer > section { + line-height: 1.5; + background-color: #f8f8ee; + flex-basis: 40ch; + + flex-grow: 1.1; + + border-left: 1px solid #000; + padding-left: 1ch; + padding-right: 1ch; + box-shadow: -1px -1px 5px #000; + margin-left: 1ch; + padding-bottom: 1ch; +} + +footer > section { + margin-bottom: 1ch; } a.underline { @@ -360,6 +441,24 @@ a.underline { background-color: #000; color: #eee; } +.query-samples dd { + margin-top: .5ch; + margin-bottom: .5ch; +} +.query-samples sample { + font-family: monospace; + color: #444; +} +.query-samples dt { + font-family: monospace; + + display: inline-block; + box-shadow: -2px -2px 5px #444; + margin-bottom: .5ch; + margin-top: .5ch; + background-color: #eee; + padding: .25ch; +} @media only screen and 
(max-device-width: 1024px) { .rightbox { width: 30ch !important; } @@ -416,7 +515,13 @@ a.underline { color: #fff; border: 3px outset #000; } + .positions { + box-shadow: 0px 0px 2px #222; + background-color: #222; + padding: 2px; + color: #fff; + } a { color: #acf; @@ -440,6 +545,8 @@ a.underline { footer { background-color: #000; color: #fff; + box-shadow: 5px 0px 5px #000; + border-top: 1px solid #888; } body { background-color: #444; @@ -455,11 +562,7 @@ a.underline { background-color: unset; border: none; } -/* .card.rs-rank-1,.card.rs-rank-2,.card.rs-rank-3,.card.rs-rank-4 { - border: 2px solid #fe05; - box-sizing: border-box; - box-shadow: 0 0 20px #fe03; - }*/ + .search-box input[name="query"] { background-color: #000 !important; color: #aaa; @@ -509,6 +612,42 @@ a.underline { background-color: #2f4858; } + footer { + background: #000; + border-top: 1px solid #444; + box-shadow: 5px 0px 5px #000; + } + + footer h2 { + border-bottom: 1px solid #888; + } + + footer > section { + background-color: #222; + border-left: 1px solid #444; + box-shadow: -1px -1px 5px #444; + } + + .form { + background-color: #000; + color: #eee; + } + + .form h1 { + color: #fff; + background-color: #2f4858; + } + + .form input[type="submit"] { + background-color: #222; + color: #fff; + } + + .form input[type="text"], .form select, .form textarea { + background-color: #222; + color: #fff; + } + } @media only print { diff --git a/marginalia_nu/src/main/resources/static/explore/style.css b/marginalia_nu/src/main/resources/static/explore/style.css new file mode 100644 index 00000000..b7af8d17 --- /dev/null +++ b/marginalia_nu/src/main/resources/static/explore/style.css @@ -0,0 +1,20 @@ +body { + max-width: 80ch; + margin: auto; + font-size: 14pt; + font-family: sans-serif; + color: #222; + line-height: 1.5; +} +th { text-align: left; } +input { font-family: monospace; font-size: 14pt; } +input[type="text"] { width: 50%; } +table { width: 100%; font-size: 14pt; } + +a.external { + color: darkcyan !important; +} +a.external:before { + content: '\01F30E'; + padding-right: .25ch; +} \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/index.hdb b/marginalia_nu/src/main/resources/templates/edge/index.hdb index 6d639865..1f71e6f6 100644 --- a/marginalia_nu/src/main/resources/templates/edge/index.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/index.hdb @@ -28,6 +28,8 @@

Publicity, Discussion and Events

+
Google ei enää tiedä
+
Helsingin Sanomat 🇫🇮 2022-10-19
Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz
Deutschlandfunk Kultur 🇩🇪, 2022-08-18
Marginalia Goes Open Source
@@ -75,6 +77,9 @@ href="https://git.marginalia.nu/marginalia/marginalia.nu">source code or contribute to the development!

+

+ The entire search engine is hosted off a single PC in Sweden, albeit with pretty solid specs. +

Consider supporting the project!

diff --git a/marginalia_nu/src/main/resources/templates/edge/indict/indict-form.hdb b/marginalia_nu/src/main/resources/templates/edge/indict/indict-form.hdb new file mode 100644 index 00000000..2fa75eab --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/edge/indict/indict-form.hdb @@ -0,0 +1,80 @@ + + + + + Marginalia Search - File complaint against {{domain}} + + + + + + + + + +{{>edge/parts/search-header}} + +
+{{>edge/parts/search-form}} + +
+ +{{#if isSubmitted}} +

Your complaint against {{domain}} has been submitted

+

The review process is manual and may take a while.

+{{/if}} + +{{#unless isSubmitted}} +

Flag {{domain}} for review

+Note, this is not intended to police acceptable thoughts or ideas. +

+That said, offensive content in obvious bad faith is not tolerated, especially when designed +to crop up when you didn't go looking for it. How and where it is said is more +important than what is said. +

+This form can also be used to appeal unfairly blacklisted sites. +

+ +

+
+ Flag for Review + +
+ +
+
+
+
+
+
+
+
+
+ +
+
+

+Communicating through forms and tables is a bit impersonal; +you may also reach a human being through email at kontakt@marginalia.nu. +{{/unless}} + +{{#if complaints}} +


+

Complaints against {{domain}}

+ + +{{#each complaints}} + + + + + +{{/each}} +
Category - Submitted - Reviewed
{{category}} - {{submitTime}} - {{#if reviewed}}✓{{/if}}
+{{/if}} +
+ +{{>edge/parts/search-footer}} + diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb index 3e1a8637..5454f364 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb @@ -1,9 +1,96 @@
- This website complies with the GDPR by not collecting any personal - information, and with the EU Cookie Directive by not using - cookies. More Information. -

- Reach me at kontakt@marginalia.nu. -

- - \ No newline at end of file +
+

Syntax

+ This is a keyword-based search engine. When entering multiple search terms, the search engine will + attempt to match them against documents where the terms occur in close proximity.

+ + Search terms can be excluded with a hyphen.

+ + While the search engine at present does not allow full text search, quotes can be used to + specifically search for names or terms in the title. Using quotes will also cause the search engine + to be as literal as possible in interpreting the query.

+ + Parentheses can be used to add terms to the query without giving weight to the terms when ranking + the search results.

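The Samples section below shows these operators in use. As a rough sketch of how a query string using them might be decomposed into term classes (hypothetical code, not Marginalia's actual query parser; for brevity it only handles single-token quotes and parentheses):

    import java.util.ArrayList;
    import java.util.List;

    // Hypothetical sketch of the operator handling described above.
    // A real parser also needs to handle multi-word quotes like "steve mcqueen".
    class QuerySyntaxSketch {
        record ParsedQuery(List<String> include, List<String> exclude,
                           List<String> exact, List<String> unweighted) { }

        static ParsedQuery parse(String query) {
            List<String> include = new ArrayList<>();    // ranked normally
            List<String> exclude = new ArrayList<>();    // -term
            List<String> exact = new ArrayList<>();      // "term", matched literally
            List<String> unweighted = new ArrayList<>(); // (term), required but not ranked

            for (String token : query.split("\\s+")) {
                if (token.startsWith("-") && token.length() > 1) {
                    exclude.add(token.substring(1));
                }
                else if (token.length() > 2 && token.startsWith("\"") && token.endsWith("\"")) {
                    exact.add(token.substring(1, token.length() - 1));
                }
                else if (token.length() > 2 && token.startsWith("(") && token.endsWith(")")) {
                    unweighted.add(token.substring(1, token.length() - 1));
                }
                else {
                    include.add(token);
                }
            }
            return new ParsedQuery(include, exclude, exact, unweighted);
        }

        public static void main(String[] args) {
            // prints include=[apology], exclude=[soup], exact=[keyboard], unweighted=[plato]
            System.out.println(parse("apology (plato) -soup \"keyboard\""));
        }
    }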
+ +

Samples

+
+
soup -chicken
+
Look for keywords that contain soup, but not + chicken.
+
"keyboard"
+
Look for pages containing the exact word + keyboard, not keyboards or the like.
+
"steve mcqueen"
+
Look for pages containing the exact words steve mcqueen + in that order, with no words in between.
+
apology (plato)
+
Look for pages containing apology and plato, but only rank them + based on their relevance to apology
+
+
+
+

Special Keywords

+ Several special keywords are supported by the search engine. +

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Keyword - Meaning
site:example.com - Display site information about example.com
site:example.com keyword - Search example.com for keyword
explore:example.com - Show similar websites to example.com
ip:127.0.0.1 - Search documents hosted at 127.0.0.1
links:example.com - Search documents linking to example.com
q:-5 - The amount of javascript and modern features is at least 5 (on a scale 0 to 25)
q:5 - The amount of javascript and modern features is at most 5 (on a scale 0 to 25)
r:5000 - The domain ranking is at most 5000 (goes up to about 100k)
format:html5 - Filter documents using the HTML5 standard. This is typically modern websites.
format:xhtml - Filter documents using the XHTML standard
format:html123 - Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites.
-special:media - Filter out documents with audio or video tags
-special:scripts - Filter out documents with javascript
-special:affiliate - Filter out documents with likely Amazon affiliate links
-special:tracking - Filter out documents with analytics or tracking code
-special:cookies - Filter out documents with cookies
+

+
+

Results Legend

+

+ The estimated relevance of the search result is indicated by the color saturation + of the search result, as well as by the order in which the results are presented. +

+

+ Information about the position of the match is indicated using a dot matrix + in the bottom bar of each search result. Each dot represents four sentences, + and the dots are presented in order from top to bottom, left to right. +

⣿⠃⠀⠀   — The terms occur heavily toward the beginning of the document. +

⠠⠀⡄⠁   — The terms occur sparsely throughout the document. +

⠀⠁⠀⠀   — The terms occur only in a single sentence. +

+

Potential problems with the document are presented with a warning triangle, e.g. ⚠ 3. + Desktop users can mouse-over this to get a detailed breakdown. +

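As an aside, the dot matrix itself is cheap to produce: braille block characters start at U+2800, and each of a cell's eight dots maps to one bit of the codepoint. A minimal sketch follows (hypothetical code; the BrailleBlockPunchCards class added in this patch may order the dots differently):

    // Braille codepoints start at U+2800; dot n is bit (n-1). Dots 1,2,3,7 form
    // the left column of a cell and dots 4,5,6,8 the right column.
    class BraillePunchCardSketch {
        // visual order (left column top-to-bottom, then right column) -> dot bit
        private static final int[] DOT_BITS = {
            0x01, 0x02, 0x04, 0x40,  // dots 1, 2, 3, 7
            0x08, 0x10, 0x20, 0x80   // dots 4, 5, 6, 8
        };

        static String printBits(long positions, int bitCount) {
            StringBuilder sb = new StringBuilder();
            for (int block = 0; block < bitCount; block += 8) {
                int pattern = 0;
                for (int i = 0; i < 8 && block + i < bitCount; i++) {
                    if ((positions & (1L << (block + i))) != 0) {
                        pattern |= DOT_BITS[i];
                    }
                }
                sb.append((char) (0x2800 + pattern)); // one braille cell per 8 bits
            }
            return sb.toString();
        }

        public static void main(String[] args) {
            // low bits set: terms concentrated toward the start of the document
            System.out.println(printBits(0b111L, 32));
        }
    }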
+ + \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb index 89e72d22..75e7f6c7 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb @@ -12,11 +12,12 @@ - + + diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/site-info-index.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/site-info-index.hdb new file mode 100644 index 00000000..9acf8bed --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/edge/parts/site-info-index.hdb @@ -0,0 +1,50 @@ +

Indexing Information

+
+
+ {{#if blacklisted}} + This website is blacklisted. This excludes it from crawling and indexing. + +

This is usually because of some form of misbehavior on the webmaster's end. + Either annoying search engine spam, or tasteless content posted in bad faith. + +

Occasionally this is done hastily and in error. If you would like the decision + reviewed, you may use this form to file a report. + {{/if}} + + {{#unless blacklisted}} +

+ Index + State: {{state}}
+ Pages Known: {{pagesKnown}}
+ Pages Crawled: {{pagesFetched}}
+ Pages Indexed: {{pagesIndexed}}
+
+
+ {{#if inCrawlQueue}} + This website is in the queue for crawling. + It may take up to a month before it is indexed. + {{/if}} + + {{#if suggestForCrawling}} +
+
+ Crawling + This website is not queued for crawling. If you would like it to be crawled, + use the checkbox and button below.

+ + +
+
+ +

+
+ {{/if}} + {{#if pagesFetched}} +

+ If you've found a reason why this website should not be indexed, + you may use this form to file a report.

+ {{/if}} + {{/unless}} +

+

+ \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/site-info-links.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/site-info-links.hdb new file mode 100644 index 00000000..0e16be4b --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/edge/parts/site-info-links.hdb @@ -0,0 +1,18 @@ +
+

Links

+
+
+
+ Link Graph + Ranking: {{ranking}}%
+ Incoming Links: {{incomingLinks}}
+ Outbound Links: {{outboundLinks}}
+
+
+
+ Explore + Which pages link here?
+ Explore similar domains
+
+
+
\ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb b/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb index 0c2e9fed..9ccaa23e 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb @@ -1,10 +1,3 @@ -{{#if scripts}}🏭️{{/if}} -{{#if tracking}}🕵️️{{/if}} -{{#if media}}🎞️{{/if}} -{{#if affiliate}}💳️{{/if}} -{{#if cookies}}👁️️{{/if}} -{{#if ads}}⚠️️️{{/if}} -{{format}} -{{#unless hideRanking}} -{{{rankingSymbol}}} -{{/unless}} \ No newline at end of file +{{#if problems}} ⚠ {{problemCount}} {{/if}} + +{{positions}} diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result.hdb b/marginalia_nu/src/main/resources/templates/edge/search-result.hdb index 4c6c9cf7..c5d220c1 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-result.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-result.hdb @@ -1,12 +1,14 @@ -
+
-

{{title}}

+

{{title}}

{{description}}

+ {{#unless focusDomain}} Info - {{#unless focusDomain}}{{#if hasMoreResults}}{{resultsFromSameDomain}}+{{/if}}{{/unless}} + {{#if hasMoreResults}}{{resultsFromSameDomain}}+{{/if}} + {{/unless}}
{{>edge/search-result-metadata}}

diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb index 837f320d..6563844c 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb @@ -24,28 +24,8 @@
-

Indexing Information

-

- Blacklisted: {{blacklisted}}
- Pages Known: {{pagesKnown}}
- Pages Crawled: {{pagesFetched}}
- Pages Indexed: {{pagesIndexed}}
- Crawl State: {{state}}
-

-
- -
-

Links

-

- Crawl Ranking: {{ranking}}%
- Incoming Links: {{incomingLinks}}
- Outbound Links: {{outboundLinks}}
-

- -
+ {{>edge/parts/site-info-index}} + {{>edge/parts/site-info-links}} {{#each results}}{{>edge/search-result}}{{/each}}
diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer-about.hdb b/marginalia_nu/src/main/resources/templates/explorer/explorer-about.hdb new file mode 100644 index 00000000..57b01e89 --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/explorer/explorer-about.hdb @@ -0,0 +1,15 @@ +

Marginalia Similar Website Finder

+ This is an experiment in website similarity ranking. It will be integrated into + Marginalia Search in the future, but it's + cool enough that it merits its own page for now. +

+

+ How does it work? + In plain English, this service looks at which websites link to a particular target website, + and then it ranks websites that are popular among those linking websites using a method commonly + used in recommendation algorithms. +

+ In technical jargon, it reinterprets the incident edges in the adjacency matrix as sparse high-dimensional vectors, + and uses cosine similarity to find the nearest neighbor nodes within this feature space. +
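A minimal sketch of that measure, assuming each domain is represented by the set of IDs of the domains linking to it (hypothetical code; within the patch itself, AndCardIntSet and its andCardinality method appear to serve this intersection-counting role):

    import java.util.Set;

    // For binary (0/1) link vectors the cosine reduces to
    // |A ∩ B| / sqrt(|A| * |B|), where A and B are the sets of inlinking domains.
    class LinkSimilaritySketch {
        static double cosineSimilarity(Set<Integer> inlinksA, Set<Integer> inlinksB) {
            if (inlinksA.isEmpty() || inlinksB.isEmpty())
                return 0;
            // count the intersection by probing the larger set with the smaller one
            Set<Integer> smaller = inlinksA.size() <= inlinksB.size() ? inlinksA : inlinksB;
            Set<Integer> larger  = smaller == inlinksA ? inlinksB : inlinksA;
            long shared = smaller.stream().filter(larger::contains).count();
            return shared / Math.sqrt((double) inlinksA.size() * inlinksB.size());
        }

        public static void main(String[] args) {
            // three shared inlinks out of four each: prints 0.75
            System.out.println(cosineSimilarity(Set.of(1, 2, 3, 4), Set.of(2, 3, 4, 5)));
        }
    }

Because the cosine divides the shared-inlink count by the geometric mean of the two set sizes, domains with heavily overlapping inlink sets score near 1 regardless of their absolute popularity, which is what makes it work as a recommendation signal.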

+
\ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer-messages.hdb b/marginalia_nu/src/main/resources/templates/explorer/explorer-messages.hdb new file mode 100644 index 00000000..163747e9 --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/explorer/explorer-messages.hdb @@ -0,0 +1,2 @@ +{{#if message}}

{{message}}

{{/if}} +{{#if aliasDomain}}

Note: {{query}} redirects to {{aliasDomain}} according to the Marginalia Search crawler.

{{/if}} \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer-results.hdb b/marginalia_nu/src/main/resources/templates/explorer/explorer-results.hdb new file mode 100644 index 00000000..01038ba0 --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/explorer/explorer-results.hdb @@ -0,0 +1,23 @@ +

Websites Similar To {{query}}

+ + + + + + {{#each resultList}} + + + + + + + + + {{/each}} +
Similarity - Domain Name
{{relatedness}} % - {{#unless active}}{{domain}}{{/unless}}{{#if active}}{{domain}}{{/if}} - {{#if indexed}}MS{{/if}} WM {{#if hasMore}}Show Related{{/if}}
+ +
+If this is your jam, you may also like other takes on the same problem: +explore.marginalia.nu, +search.marginalia.nu/explore/random.

+They use a worse algorithm, but the visual aspect does quite a lot too. diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer-search.hdb b/marginalia_nu/src/main/resources/templates/explorer/explorer-search.hdb new file mode 100644 index 00000000..83e50932 --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/explorer/explorer-search.hdb @@ -0,0 +1,7 @@ +

Search

+
+ + + +
+
\ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer.hdb b/marginalia_nu/src/main/resources/templates/explorer/explorer.hdb new file mode 100644 index 00000000..68b91601 --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/explorer/explorer.hdb @@ -0,0 +1,25 @@ + + + + +{{#if query}} Websites Similar To {{query}} {{/if}} +{{#unless query}} Marginalia Similar Website Finder {{/unless}} + + + +{{>explorer/explorer-about}} +{{>explorer/explorer-search}} +{{>explorer/explorer-messages}} +{{#if resultList}}{{>explorer/explorer-results}}{{/if}} +{{#unless query}} +Don't know what to try? How about + +{{/unless}} + + \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/AndCardIntSetTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/AndCardIntSetTest.java new file mode 100644 index 00000000..8f0e7d11 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/AndCardIntSetTest.java @@ -0,0 +1,24 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class AndCardIntSetTest { + + @Test + public void testCardinality() { + assertEquals(1, AndCardIntSet.of(1).getCardinality()); + assertEquals(0, AndCardIntSet.of().getCardinality()); + assertEquals(0, new AndCardIntSet().getCardinality()); + assertEquals(4, AndCardIntSet.of(0,5,2,4).getCardinality()); + } + @Test + public void testAndCardinality() { + assertEquals(1, AndCardIntSet.andCardinality(AndCardIntSet.of(1,3,5), AndCardIntSet.of(2,3,4))); + assertEquals(4, AndCardIntSet.andCardinality(AndCardIntSet.of(1,2,3,4,5, 6), AndCardIntSet.of(1,2,3,4))); + assertEquals(4, AndCardIntSet.andCardinality(AndCardIntSet.of(0, 1,2,3,4,5, 6), AndCardIntSet.of(1,2,3,4))); + assertEquals(4, AndCardIntSet.andCardinality(AndCardIntSet.of(1,2,3,4,5, 6), AndCardIntSet.of(0, 1,2,3,4))); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java new file mode 100644 index 00000000..0efe59d8 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java @@ -0,0 +1,17 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +class BrailleBlockPunchCardsTest { + @Test + public void test() { + for (int i = 0; i <= 512; i++) { + if ((i % 8) == 0) { + System.out.println(); + } + System.out.print(BrailleBlockPunchCards.printBits(i, 8)); + + } + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index 73aa4dc3..c55b597d 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -16,7 +16,8 @@ import java.util.HashSet; import java.util.Set; import java.util.StringJoiner; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; class BTreeWriterTest { @@ -100,10 +101,9 @@ class BTreeWriterTest { } { - var reader = new BTreeReader(mmf, ctx); - var header = reader.getHeader(0); + var reader = new BTreeReader(mmf, ctx, 0); for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(header, data[i]); + long offset = 
reader.findEntry(data[i]); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(i, mmf.get(offset+1)); } @@ -143,10 +143,9 @@ class BTreeWriterTest { } { - var reader = new BTreeReader(mmf, ctx); - var header = reader.getHeader(0); + var reader = new BTreeReader(mmf, ctx, 0); for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(header, data[i]); + long offset = reader.findEntry(data[i]); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(i, mmf.get(offset+1)); } @@ -154,7 +153,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long)(Long.MAX_VALUE * Math.random()); while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(header, val)); + assertTrue(reader.findEntry( val) < 0); } } } catch (Exception e) { @@ -191,13 +190,12 @@ class BTreeWriterTest { } { - var reader = new BTreeReader(mmf, ctx); - var header = reader.getHeader(0); + var reader = new BTreeReader(mmf, ctx, 0); - printTreeLayout(toPut.size(), header, ctx); + printTreeLayout(toPut.size(), reader.getHeader(), ctx); for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(header, data[i]); + long offset = reader.findEntry(data[i]); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(data[i], mmf.get(offset)); } @@ -205,7 +203,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(header, val)); + assertTrue(reader.findEntry( val) < 0); } } } catch (Exception e) { @@ -244,13 +242,12 @@ class BTreeWriterTest { } { - var reader = new BTreeReader(mmf, ctx); - var header = reader.getHeader(0); + var reader = new BTreeReader(mmf, ctx, 0); - printTreeLayout(toPut.size(), header, ctx); + printTreeLayout(toPut.size(), reader.getHeader(), ctx); for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(header, data[i] & mask); + long offset = reader.findEntry(data[i] & mask); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(data[i], mmf.get(offset)); } @@ -258,7 +255,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(header, val & mask)); + assertTrue(reader.findEntry(val & mask) < 0); } } } catch (Exception e) { @@ -298,13 +295,12 @@ class BTreeWriterTest { } { - var reader = new BTreeReader(mmf, ctx); - var header = reader.getHeader(0); + var reader = new BTreeReader(mmf, ctx, 0); - printTreeLayout(toPut.size(), header, ctx); + printTreeLayout(toPut.size(), reader.getHeader(), ctx); for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(header, data[i] & mask); + long offset = reader.findEntry(data[i] & mask); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); assertEquals(data[i], mmf.get(offset)); assertEquals(i, mmf.get(offset+1)); @@ -313,7 +309,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(header, val & mask)); + assertTrue(reader.findEntry(val & mask) < 0); } } } catch (Exception e) { diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java deleted file mode 100644 index 7969e236..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java +++ /dev/null @@ -1,335 +0,0 @@ -package nu.marginalia.util.btree; - -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.btree.model.BTreeHeader; -import nu.marginalia.util.multimap.MultimapFileLong; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.HashSet; -import java.util.Set; -import java.util.StringJoiner; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -class BTreeWriterTestCachedReader { - - final BTreeContext ctx = new BTreeContext(4, 2, 0xFFFF_FFFF_FFFF_FFFFL, 3); - final BTreeWriter writer = new BTreeWriter(null, ctx); - - Logger logger = LoggerFactory.getLogger(getClass()); - @Test - void testSmallDataBlock() { - var header = writer.makeHeader(1024, ctx.BLOCK_SIZE_WORDS()/2); - assertEquals(1024 + BTreeHeader.BTreeHeaderSizeLongs, header.dataOffsetLongs()); - assertEquals(header.dataOffsetLongs(), header.indexOffsetLongs()); - } - - @Test - void testLayerCount() { - int wsq = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - - assertEquals(2, writer.makeHeader(1024, wsq-1).layers()); - assertEquals(2, writer.makeHeader(1024, wsq).layers()); - assertEquals(3, writer.makeHeader(1024, wsq+1).layers()); - - assertEquals(3, writer.makeHeader(1024, wcub-1).layers()); - assertEquals(3, writer.makeHeader(1024, wcub).layers()); - assertEquals(4, writer.makeHeader(1024, wcub+1).layers()); - } - - @Test - void testLayerOffset() { - int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0)); - System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1)); - System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2)); - - for (int i = 0; i < 1024; i++) { - var header = writer.makeHeader(0, i); - - - printTreeLayout(i, header, ctx); - - if (header.layers() >= 1) { - assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS()); - } - } - } - - private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) { - StringJoiner sj = new StringJoiner(","); - for (int l = 0; l < header.layers(); l++) { - sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS()); - } - System.out.println(numEntries + ":" + sj); - } - - @Test - public void testWriteEntrySize2() throws IOException { - - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); - } - - int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); - - try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); - - { - var writer = new BTreeWriter(mmf, ctx); 
- writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put( 2L*i + 1, i); - } - }); - mmf.force(); - } - - { - var reader = new CachingBTreeReader(mmf, ctx); - var header = reader.getHeader(0); - var cache = reader.prepareCache(header); - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(cache, data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(i, mmf.get(offset+1)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - - @Test - public void testWriteEntrySize2Small() throws IOException { - - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - for (int i = 0; i < 5; i++) { - while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); - } - - int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); - - try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); - - { - var writer = new BTreeWriter(mmf, ctx); - writer.write( 0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put(2L*i + 1, i); - } - }); - mmf.force(); - } - - { - var reader = new CachingBTreeReader(mmf, ctx); - var header = reader.getHeader(0); - var cache = reader.prepareCache(header); - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(cache, data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(i, mmf.get(offset+1)); - } - - for (int i = 0; i < 500; i++) { - long val = (long)(Long.MAX_VALUE * Math.random()); - while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(cache, val)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - - - @Test - public void testWriteEqualityNotMasked() throws IOException { - for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - var ctx = new BTreeContext(5, 1, ~0, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); - } - }); - mmf.force(); - } - - { - var reader = new CachingBTreeReader(mmf, ctx); - var header = reader.getHeader(0); - var cache = reader.prepareCache(header); - - printTreeLayout(toPut.size(), header, ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(cache, data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(cache, val)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - } - - @Test - public void testWriteEqualityMasked() throws IOException { - - for (int bs = 2; bs 
<= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - long mask = 0xFFFF_FFFF_0000_0000L; - var ctx = new BTreeContext(5, 1, mask, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); - } - }); - mmf.force(); - } - - { - var reader = new CachingBTreeReader(mmf, ctx); - var header = reader.getHeader(0); - var cache = reader.prepareCache(header); - - printTreeLayout(toPut.size(), header, ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(cache,data[i] & mask); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(cache, val & mask)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - } - - @Test - public void testWriteTwoEqualityMasked() throws IOException { - - for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - long mask = 0xFFFF_FFFF_0000_0000L; - var ctx = new BTreeContext(5, 2, mask, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i*2L, data[i]); - slice.put(i*2L+1, i); - } - }); - mmf.force(); - } - - { - var reader = new CachingBTreeReader(mmf, ctx); - var header = reader.getHeader(0); - var cache = reader.prepareCache(header); - - printTreeLayout(toPut.size(), header, ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(cache, data[i] & mask); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - assertEquals(i, mmf.get(offset+1)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(cache,val & mask)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - } - - - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java index 93171310..50730390 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java @@ -4,7 +4,8 @@ import nu.marginalia.util.language.LanguageFilter; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.*; +import static 
org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; class LanguageFilterTest { @@ -17,7 +18,7 @@ class LanguageFilterTest { } @Test - public void isStringChinsese() { + public void isStringChinese() { var languageFilter = new LanguageFilter(); assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面(木が生えているところ)は普賢岳の山体です.今回の噴火にともない,このあたりの山体がマグマに押されて変形し,北(写真では左)にむかって100mほどせりだしました\n")); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index 118229b1..b8da0723 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -7,6 +7,7 @@ import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.KeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; @@ -15,11 +16,9 @@ import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeDomain; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import java.io.IOException; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; @@ -56,7 +55,7 @@ class SentenceExtractorTest { var doc = Jsoup.parse(Files.readString(file.toPath())); long start = System.currentTimeMillis(); var dld = se.extractSentences(doc); - documentKeywordExtractor.extractKeywords(dld); + documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata(0)); total += (System.currentTimeMillis() - start); } System.out.println(total); @@ -119,7 +118,7 @@ class SentenceExtractorTest { var newResult = newSe.extractSentences(Jsoup.parse(post.body)); - var newRes = documentKeywordExtractor.extractKeywords(newResult); + var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata(0)); System.out.println(newRes); }); reader.join(); @@ -141,7 +140,7 @@ class SentenceExtractorTest { long st = System.currentTimeMillis(); for (var file : Objects.requireNonNull(data.toFile().listFiles())) { var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath()))); - var newRes = documentKeywordExtractor.extractKeywords(newResult); + var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata(0)); System.out.println(newRes); } System.out.println(System.currentTimeMillis() - st); @@ -150,12 +149,12 @@ class SentenceExtractorTest { @SneakyThrows @Test - @Disabled public void testSE() { - var result = newSe.extractSentences(Jsoup.parse(new URL("https://memex.marginalia.nu/log/26-personalized-pagerank.gmi"), 10000)); + var result = newSe.extractSentences( + Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html")))); var dict = new TermFrequencyDict(lm); - System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result)); + System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new 
KeywordMetadata(0))); // diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java deleted file mode 100644 index 3dc2d57c..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader.query.types; - -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; -import org.junit.jupiter.api.Test; - -import java.util.List; - -class QueryFilterStepIfTest { - QueryFilterStepIf even = new QueryFilterStepFromPredicate(l -> (l%2) == 0); - QueryFilterStepIf divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0); - QueryFilterStepIf either = QueryFilterStepIf.anyOf(List.of(even, divisibleByThree)); - @Test - public void test() { - long[] values = new long[100]; - - for (int i = 0; i < values.length; i++) { - values[i] = i; - } - - int end = either.retainDestructive(values, 100); -// end = even.retainReorder(values, end, 100); - - for (int i = 0; i < values.length; i++) { - if (i == end) System.out.println("*"); - System.out.println(values[i]); - } - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java index bb7b360e..99785031 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; import nu.marginalia.util.multimap.MultimapFileLong; +import org.apache.commons.lang3.ArrayUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -96,16 +97,131 @@ class MultimapFileTest { } + @Test + void testQuickSort() throws IOException { + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8); + var sorter = file.createSorter(Path.of("/tmp"), 16, 2); + + for (int start = 0; start < 8; start+=2) { + System.out.println("~"); + for (int end = start; end < 128; end+=2) { + for (int i = 0; i < 128; i+=2) { + file.put(i, -i/2); + file.put(i+1, i/2); + } + sorter.quickSortLH(start, end); + for (int i = start+2; i < end; i+=2) { + + System.out.println("**" + i); + System.out.println(file.get(i-2)); + System.out.println(file.get(i-1)); + System.out.println(file.get(i)); + System.out.println(file.get(i+1)); + + assertTrue(file.get(i-2) <= file.get(i)); + assertEquals(file.get(i+1), -file.get(i)); + } + System.out.println("~"); + } + } + + } + + @Test + void testSort() throws IOException { + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8); + var sorter = file.createSorter(Path.of("/tmp"), 1024, 2); + + long[] values = new long[65536]; + for (int i = 0; i < values.length; i++) { + values[i] = i; + } + ArrayUtils.shuffle(values); + + int start = 6; + System.out.println(start); + for (int end = start+2; end < values.length; end+=100) { + + for (long i = 0; i < end+1; i+=2) { + file.put(i, values[(int)i/2]); + file.put(i+1, i/2); + } + + + 
file.put(start-2, 100000); + file.put(end, 1); + sorter.sortRange(start, end); + + for (int i = start+2; i < end; i+=2) { + assertTrue(file.get(i-2) < file.get(i)); + } + + assertEquals(100000, file.get(start-2)); + assertEquals(1, file.get(end)); + } + + } + + @Test + void testInsertionSort() throws IOException { + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8); + var sorter = file.createSorter(Path.of("/tmp"), 16, 2); + + for (int start = 2; start < 8; start+=2) { + for (int end = start+2; end < 126; end+=2) { + for (int i = 0; i < 128; i+=2) { + file.put(i, -(128-i/2)); + file.put(i+1, (128-i/2)); + } + file.put(0, 0xFFFF_FFFFL); + file.put(end, 0x7FFF_FFFFL); + sorter.insertionSort(start, (end - start)/2); + assertEquals(0xFFFF_FFFFL, file.get(0)); + assertEquals(file.get(end), 0x7FFF_FFFFL); + for (int i = start+2; i < end; i+=2) { + assertTrue(file.get(i-2) <= file.get(i)); + assertEquals(file.get(i+1), -file.get(i)); + } + } + } + } + + @Test + void testMergeSort() throws IOException { + var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8); + var sorter = file.createSorter(Path.of("/tmp"), 16, 2); + + for (int start = 0; start < 512; start+=18) { + System.out.println(start); + for (int end = start+2; end < 8192; end+=68) { + for (int i = 0; i < 8192; i+=2) { + file.put(i, -i/2); + file.put(i+1, i/2); + } + sorter.mergeSort(start, end-start); + + assertEquals(file.get(start+1), -file.get(start)); + for (int i = start+2; i < end; i+=2) { +// System.out.println(file.get(i-2) + "," + file.get(i)); + assertTrue(file.get(i-2) <= file.get(i)); + +// System.out.println(file.get(i+1) + ":" + -file.get(i)); + assertEquals(file.get(i+1), -file.get(i)); + } + } + } + } + @Test void sortInternal() throws IOException { var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); - var sorter = file.createSorter(Path.of("/tmp"), 16); + var sorter = file.createSorter(Path.of("/tmp"), 16, 1); var searcher = file.createSearcher(); for (int i = 0; i < 32; i++) { file.put(i, 32-i); } - sorter.sort( 2, 14); + sorter.sortRange( 2, 16); for (int i = 2+1; i < 16; i++) { assertTrue(file.get(i) > file.get(i-1)); @@ -116,14 +232,14 @@ class MultimapFileTest { @Test void sortExternal() throws IOException { var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); - var sorter = file.createSorter(Path.of("/tmp"), 2); + var sorter = file.createSorter(Path.of("/tmp"), 2, 1); var searcher = file.createSearcher(); for (int i = 0; i < 32; i++) { file.put(i, 32-i); } - sorter.sort( 2, 14); + sorter.sortRange( 2, 16); file.force(); for (int i = 2+1; i < 16; i++) { @@ -132,6 +248,7 @@ class MultimapFileTest { } } + @Test void close() { } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java index f0e6ecc0..aaa072e1 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java @@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.multimap.MultimapFileLong; +import 
nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; @@ -10,6 +12,7 @@ import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -19,6 +22,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.List; class SearchIndexJournalWriterTest { KeywordLexicon keywordLexicon; @@ -58,18 +62,102 @@ class SearchIndexJournalWriterTest { } @Test - void put() throws IOException { - writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link), - new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 })); - writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words_1), - new SearchIndexJournalEntry(new long[] { 5, 6, 7 })); + void put() throws IOException, InterruptedException { + + for (int i = 0; i < 512; i++) { + if (i % 2 == 0) { + writer.put(new SearchIndexJournalEntryHeader(4, i, IndexBlock.Words_1), + new SearchIndexJournalEntry(new long[]{keywordLexicon.getOrInsert("one"), + 0x000000, + keywordLexicon.getOrInsert("two"), + 0xFFFFFF})); + } + else { + writer.put(new SearchIndexJournalEntryHeader(2, i, IndexBlock.Words_1), + new SearchIndexJournalEntry(new long[]{keywordLexicon.getOrInsert("one"), + 0x000000})); + } + } + keywordLexicon.commitToDisk(); + Thread.sleep(1000); writer.forceWrite(); var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(indexFile)); - reader.forEach(entry -> { + + for (var entry : reader) { logger.info("{}, {} {}", entry, entry.urlId(), entry.domainId()); - logger.info("{}", entry.readEntry().toArray()); - }); + for (var record : entry.readEntry()) { + logger.info("{}", record); + } + } + + new SearchIndexConverter(IndexBlock.Words_1, 7, Path.of("/tmp"), + indexFile.toFile(), + wordsFile1.toFile(), + urlsFile1.toFile(), + new SearchIndexPartitioner(null), (url) -> false) + .convert(); + + MultimapFileLong mmf = MultimapFileLong.forReading(urlsFile1); + for (int i = 0; i < 1056; i++) { + System.out.println(i + ":" + mmf.get(i)); + } + try (var idx = new SearchIndex("test", urlsFile1.toFile(), wordsFile1.toFile())) { + for (String s : List.of("one", "two", "3")) { + System.out.println("***" + s); + var range = idx.rangeForWord(keywordLexicon.getOrInsert(s)); + System.out.println(range); + + System.out.println(1 + "? " + range.hasUrl(1)); + System.out.println(2 + "? 
" + range.hasUrl(2)); + + var source = range.asEntrySource(); + System.out.println(source); + + } + + } catch (Exception e) { + throw new RuntimeException(e); + } + + } + + @Test + void testWeirdScenario() throws IOException, InterruptedException { + long[] vals = new long[]{3818531806586L, 1696527885824L, 3818531806586L, 1679348016640L, 3818531806611L, 1168242909952L, 3818531806611L, 1168242909952L, 4316748027839L, 549761847552L, 47240643248522L, 285873040601600L, 51101820141195L, 1099517497600L, 51101820141295L, 549762863360L}; + + for (int v = 0; v < vals.length / 2; v++) { + writer.put(new SearchIndexJournalEntryHeader(4, vals[v * 2], IndexBlock.Words_1), + new SearchIndexJournalEntry(new long[]{keywordLexicon.getOrInsert("one"), vals[v * 2 + 1]})); + } + + keywordLexicon.commitToDisk(); + Thread.sleep(1000); + writer.forceWrite(); + + var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(indexFile)); + + for (var entry : reader) { + logger.info("{}, {} {}", entry, entry.urlId(), entry.domainId()); + for (var record : entry.readEntry()) { + logger.info("{}", record); + } + } + + new SearchIndexConverter(IndexBlock.Words_1, 7, Path.of("/tmp"), + indexFile.toFile(), + wordsFile1.toFile(), + urlsFile1.toFile(), + new SearchIndexPartitioner(null), (url) -> false) + .convert(); + + try (var idx = new SearchIndex("test", urlsFile1.toFile(), wordsFile1.toFile())) { + var range = idx.rangeForWord(keywordLexicon.getOrInsert("one")); + long[] buffer = new long[128]; + + } + catch (Exception ex) { ex.printStackTrace(); } + } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java new file mode 100644 index 00000000..7290a01a --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java @@ -0,0 +1,218 @@ +package nu.marginalia.wmsa.edge.index.svc.query; + +import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.btree.BTreeWriter; +import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexURLRange; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterAnyOf; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeReject; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeRetain; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.stream.LongStream; + +import static org.junit.jupiter.api.Assertions.*; + +class IndexQueryTest { + static Path file; + + static long threesOffset; + static long fivesOffset; + static long sevensOffset; + static long smallSeventeenOffset; + + // sz should be large enough to ensure the tree has multiple layers to shake out bugs + static int sz = 128*512*512*2; + + static MultimapFileLong mmf; + @BeforeAll + static void setUpAll() throws IOException { + file = Files.createTempFile(IndexQueryTest.class.getSimpleName(), ".dat"); + + try (var mmf = MultimapFileLong.forOutput(file, 10_000_000)) { + var bTreeWriter = new BTreeWriter(mmf, SearchIndexConverter.urlsBTreeContext); + + threesOffset = 0; + long written = 0; + 
written = bTreeWriter.write(0, sz / 2, w -> { + for (int i = 0; i < sz; i+=2) { + w.put(i, 3L*(i/2)); + w.put(i+1, i/2); + } + }); + + fivesOffset += written; + sevensOffset += written; + smallSeventeenOffset += written; + + written = bTreeWriter.write(fivesOffset, sz/2, w -> { + for (int i = 0; i < sz; i+=2) { + w.put(i, 5L*(i/2)); + w.put(i+1, (i/2)); + } + }); + + sevensOffset += written; + smallSeventeenOffset += written; + + written = bTreeWriter.write(sevensOffset, sz / 2, w -> { + for (int i = 0; i < sz; i+=2) { + w.put(i, 7L*(i/2)); + w.put(i+1, (i/2)); + } + }); + + smallSeventeenOffset += written; + + written = bTreeWriter.write(smallSeventeenOffset, 100, w -> { + for (int i = 0; i < 200; i+=2) { + w.put(i, 17L*(i/2)); + w.put(i+1, (i/2)); + } + }); + } + + mmf = MultimapFileLong.forReading(file); + + + } + + public SearchIndexURLRange threesRange() { + return new SearchIndexURLRange(mmf, threesOffset); + } + public SearchIndexURLRange fivesRange() { + return new SearchIndexURLRange(mmf, fivesOffset); + } + public SearchIndexURLRange sevensRange() { + return new SearchIndexURLRange(mmf, sevensOffset); + } + public SearchIndexURLRange seventeensRange() { + return new SearchIndexURLRange(mmf, smallSeventeenOffset); + } + + @AfterAll + static void tearDownAll() throws IOException { + mmf.close(); + Files.deleteIfExists(file); + } + + @Test + public void testMergeRanges() { + BTreeQueryBuffer buffer = new BTreeQueryBuffer(300); + + IndexQuery query = new IndexQuery(List.of(seventeensRange().asEntrySource(), threesRange().asEntrySource())); + + /** Read from 17s range */ + + // 17s range is shorter and should read fully in one go + + query.getMoreResults(buffer); + assertFalse(buffer.isEmpty()); + assertArrayEquals(LongStream.range(0, 100).map(l -> l*17).toArray(), buffer.copyData()); + + /** Read from 3s range */ + + assertTrue(query.hasMore()); + query.getMoreResults(buffer); + assertArrayEquals(LongStream.range(0, 150).map(l -> l*3).toArray(), buffer.copyData()); + + /** Ensure 3s range is not flagged as finished */ + + assertFalse(buffer.isEmpty()); + assertTrue(query.hasMore()); + } + + @Test + public void test() { + BTreeQueryBuffer buffer = new BTreeQueryBuffer(300); + + IndexQuery query = new IndexQuery(List.of(threesRange().asPrefixSource(102, 200))); + + /** Read from 3s range */ + + query.getMoreResults(buffer); + System.out.println(Arrays.toString(buffer.copyData())); + assertFalse(buffer.isEmpty()); + assertArrayEquals(LongStream.range(100, 200).filter(v -> (v % 3) == 0).toArray(), buffer.copyData()); + + } + + @Test + public void testInclude() { + BTreeQueryBuffer buffer = new BTreeQueryBuffer(300); + + /** Set up filters */ + var es = threesRange().asEntrySource(); + es.skip(10000); + IndexQuery query = new IndexQuery(List.of(es)); + + query.addInclusionFilter(new QueryFilterBTreeRangeRetain(fivesRange())); + query.addInclusionFilter(new QueryFilterBTreeRangeRetain(sevensRange())); + + /** Do it */ + query.getMoreResults(buffer); + assertArrayEquals(LongStream.range(10000, 10150) + .map(l -> l*3) + .filter(l -> (l % 5) == 0) + .filter(l -> (l % 7) == 0) + .toArray(), buffer.copyData()); + } + + @Test + public void testIncludeReject() { + BTreeQueryBuffer buffer = new BTreeQueryBuffer(300); + + /** Set up filters */ + var es = threesRange().asEntrySource(); + es.skip(10000); + IndexQuery query = new IndexQuery(List.of(es)); + + query.addInclusionFilter(new QueryFilterBTreeRangeRetain(fivesRange())); + query.addInclusionFilter(new 
QueryFilterBTreeRangeReject(sevensRange())); + + /** Do it */ + query.getMoreResults(buffer); + assertArrayEquals(LongStream.range(10000, 10150) + .map(l -> l*3) + .filter(l -> (l % 5) == 0) + .filter(l -> (l % 7) != 0) + .toArray(), buffer.copyData()); + } + + + @Test + public void testIncludeEither() { + BTreeQueryBuffer buffer = new BTreeQueryBuffer(300); + + /** Set up filters */ + var es = threesRange().asEntrySource(); + es.skip(10000); + IndexQuery query = new IndexQuery(List.of(es)); + query.addInclusionFilter(new QueryFilterAnyOf( + List.of(new QueryFilterBTreeRangeRetain(fivesRange()), + new QueryFilterBTreeRangeRetain(sevensRange())))); + + /** Do it */ + query.getMoreResults(buffer); + assertArrayEquals(LongStream.range(10000, 10150) + .map(l -> l*3) + .filter(l -> (l % 5) == 0 || (l % 7) == 0) + .toArray(), buffer.copyData()); + } + + @Test + public void testLoadMeta() { + long[] data = new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }; + threesRange().getMetadata(data); + System.out.println(Arrays.toString(data)); + + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java index 8f250456..62ddde7b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java @@ -4,6 +4,7 @@ import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata; import org.junit.jupiter.api.Disabled; @@ -35,6 +36,6 @@ class ArxivParserTest { var se = new SentenceExtractor(lm); - data.stream().map(meta -> documentKeywordExtractor.extractKeywords(se.extractSentences(meta.getAbstract(), meta.getTitle()))).limit(100).forEach(System.out::println); + data.stream().map(meta -> documentKeywordExtractor.extractKeywords(se.extractSentences(meta.getAbstract(), meta.getTitle()), new KeywordMetadata(0))).limit(100).forEach(System.out::println); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java new file mode 100644 index 00000000..ecb182f4 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java @@ -0,0 +1,64 @@ +package nu.marginalia.wmsa.edge.model.crawl; + +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; +import org.junit.jupiter.api.Test; + +import java.util.EnumSet; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class EdgePageWordMetadataTest { + + @Test + public void codecTest() { + verifyCodec("Vanilla case", new EdgePageWordMetadata(32, 0x7f0f0000, 5, 1, EnumSet.allOf(EdgePageWordFlags.class))); + verifyCodec("Position high", new EdgePageWordMetadata(32, 0xff0f0000, 5, 1, EnumSet.allOf(EdgePageWordFlags.class))); + verifyCodec("No flags", new EdgePageWordMetadata(32, 0xff0f0000, 5, 1, 
EnumSet.noneOf(EdgePageWordFlags.class))); + System.out.println(new EdgePageWordMetadata(32, 0x7f0f0005, 5, 1, EnumSet.allOf(EdgePageWordFlags.class))); + System.out.println(new EdgePageWordMetadata(32, 0xff0f0013, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class))); + } + + @Test + public void testClampTfIdfLow() { + var original = new EdgePageWordMetadata(0x8000FFFF, 0, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class)); + var encoded = new EdgePageWordMetadata(original.encode()); + + assertEquals(original.positions(), encoded.positions()); + assertEquals(0, encoded.tfIdf()); + } + + @Test + public void testClampTfIdfHigh() { + var original = new EdgePageWordMetadata(0x7000FFFF, 0, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class)); + var encoded = new EdgePageWordMetadata(original.encode()); + + assertEquals(original.positions(), encoded.positions()); + assertEquals(65535, encoded.tfIdf()); + } + + @Test + public void testClampCountLow() { + var original = new EdgePageWordMetadata(40, 0, 5, -1, EnumSet.noneOf(EdgePageWordFlags.class)); + var encoded = new EdgePageWordMetadata(original.encode()); + + assertEquals(original.positions(), encoded.positions()); + assertEquals(0, encoded.count()); + } + + @Test + public void testClampCountHigh() { + var original = new EdgePageWordMetadata(40, 0, 5, 17, EnumSet.noneOf(EdgePageWordFlags.class)); + var encoded = new EdgePageWordMetadata(original.encode()); + + assertEquals(original.positions(), encoded.positions()); + assertEquals(15, encoded.count()); + } + + + public void verifyCodec(String message, EdgePageWordMetadata data) { + assertEquals(data, new EdgePageWordMetadata(data.encode()), message); + } + + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java index 4e1bc2b6..4b040983 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java @@ -64,6 +64,15 @@ class BodyQueryParserTest { assertEquals("-hello", results.get(0).displayStr); } + @Test + void parseNear() { + var results = parser.parse("near:memex.marginalia.nu"); + results.forEach(System.out::println); + assertEquals(TokenType.NEAR_TERM, results.get(0).type); + assertEquals("memex.marginalia.nu", results.get(0).str); + assertEquals("near:memex.marginalia.nu", results.get(0).displayStr); + } + @Test void parseCombined() { for (var list : parser.permuteQueries(parser.parse("dune 2 remake"))) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index 79946d82..706986b9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -37,6 +37,8 @@ class QueryVariantsTest { void getQueryVariants() { System.out.println(se.extractSentence("we are alone")); testCase("inside job reviews"); + testCase("plato apology"); + testCase("mechanical keyboard"); testCase("DOS"); testCase("dos"); testCase("we are alone"); diff --git a/protocol/def/index.proto b/protocol/def/index.proto index 30cf916d..53c82cf6 100644 --- a/protocol/def/index.proto +++ b/protocol/def/index.proto @@ -13,6 +13,7 @@ message IndexPutKeywordsReq { message 
WordSet { int32 index = 1; repeated string words = 2; + repeated int64 meta = 3; } } diff --git a/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java b/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java index 0698c5c3..abe47a73 100644 --- a/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java +++ b/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java @@ -1,12 +1,12 @@ package com.upserve.uppend.blobs; -import jnr.ffi.*; -import jnr.ffi.types.size_t; import com.kenai.jffi.MemoryIO; +import jnr.ffi.LibraryLoader; +import jnr.ffi.types.size_t; import java.io.IOException; -import java.nio.*; +import java.nio.MappedByteBuffer; // https://github.com/upserve/uppend/blob/70967c6f24d7f1a3bbc18799f485d981da93f53b/src/main/java/com/upserve/uppend/blobs/NativeIO.java // MIT License diff --git a/third_party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java b/third_party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java new file mode 100644 index 00000000..5f87a0b0 --- /dev/null +++ b/third_party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentdetect; + +import opennlp.tools.util.StringUtil; + +import java.util.*; + + +// https://opennlp.apache.org/ + + +// This code is hot patched to increase the default buffer size, which is sub-optimal in +// the released version + + +/** + * Generate event contexts for maxent decisions for sentence detection. + * + */ +public class DefaultSDContextGenerator implements SDContextGenerator { + + /** + * String buffer for generating features. + */ + protected StringBuilder buf; + + /** + * List for holding features as they are generated. + */ + protected List<String> collectFeats; + + private Set<String> inducedAbbreviations; + + private Set<Character> eosCharacters; + + /** + * Creates a new SDContextGenerator instance with + * no induced abbreviations. + * + * @param eosCharacters + */ + public DefaultSDContextGenerator(char[] eosCharacters) { + this(Collections.emptySet(), eosCharacters); + } + + /** + * Creates a new SDContextGenerator instance which uses + * the set of induced abbreviations. + * + * @param inducedAbbreviations a Set of Strings + * representing induced abbreviations in the training data. + * Example: "Mr."
+ * + * @param eosCharacters + */ + public DefaultSDContextGenerator(Set<String> inducedAbbreviations, char[] eosCharacters) { + this.inducedAbbreviations = inducedAbbreviations; + this.eosCharacters = new HashSet<>(); + for (char eosChar: eosCharacters) { + this.eosCharacters.add(eosChar); + } + buf = new StringBuilder(128); + collectFeats = new ArrayList<>(); + } + + private static String escapeChar(Character c) { + if (c == '\n') { + return "<LF>"; + } + + if (c == '\r') { + return "<CR>"; + } + + return new String(new char[]{c}); + } + + /* (non-Javadoc) + * @see opennlp.tools.sentdetect.SDContextGenerator#getContext(java.lang.CharSequence, int) + */ + public String[] getContext(CharSequence sb, int position) { + + /* + * String preceding the eos character in the eos token. + */ + String prefix; + + /* + * Space delimited token preceding token containing eos character. + */ + String previous; + + /* + * String following the eos character in the eos token. + */ + String suffix; + + /* + * Space delimited token following token containing eos character. + */ + String next; + + int lastIndex = sb.length() - 1; + { // compute space previous and space next features. + if (position > 0 && StringUtil.isWhitespace(sb.charAt(position - 1))) + collectFeats.add("sp"); + if (position < lastIndex && StringUtil.isWhitespace(sb.charAt(position + 1))) + collectFeats.add("sn"); + collectFeats.add("eos=" + escapeChar(sb.charAt(position))); + } + int prefixStart = previousSpaceIndex(sb, position); + + int c = position; + { ///assign prefix, stop if you run into a period though otherwise stop at space + while (--c > prefixStart) { + if (eosCharacters.contains(sb.charAt(c))) { + prefixStart = c; + c++; // this gets us out of while loop. + } + } + prefix = String.valueOf(sb.subSequence(prefixStart, position)).trim(); + } + int prevStart = previousSpaceIndex(sb, prefixStart); + previous = String.valueOf(sb.subSequence(prevStart, prefixStart)).trim(); + + int suffixEnd = nextSpaceIndex(sb, position, lastIndex); + { + c = position; + while (++c < suffixEnd) { + if (eosCharacters.contains(sb.charAt(c))) { + suffixEnd = c; + c--; // this gets us out of while loop. + } + } + } + int nextEnd = nextSpaceIndex(sb, suffixEnd + 1, lastIndex + 1); + if (position == lastIndex) { + suffix = ""; + next = ""; + } + else { + suffix = String.valueOf(sb.subSequence(position + 1, suffixEnd)).trim(); + next = String.valueOf(sb.subSequence(suffixEnd + 1, nextEnd)).trim(); + } + + collectFeatures(prefix,suffix,previous,next, sb.charAt(position)); + + String[] context = new String[collectFeats.size()]; + context = collectFeats.toArray(context); + collectFeats.clear(); + return context; + } + + /** + * Determines some of the features for the sentence detector and adds them to list features. + * + * @param prefix String preceding the eos character in the eos token. + * @param suffix String following the eos character in the eos token. + * @param previous Space delimited token preceding token containing eos character. + * @param next Space delimited token following token containing eos character. + * + * @deprecated use {@link #collectFeatures(String, String, String, String, Character)} instead. + */ + protected void collectFeatures(String prefix, String suffix, String previous, String next) { + collectFeatures(prefix, suffix, previous, next, null); + } + + /** + * Determines some of the features for the sentence detector and adds them to list features. + * + * @param prefix String preceding the eos character in the eos token.
+ * @param suffix String following the eos character in the eos token. + * @param previous Space delimited token preceding token containing eos character. + * @param next Space delimited token following token containing eos character. + * @param eosChar the EOS character being analyzed + */ + protected void collectFeatures(String prefix, String suffix, String previous, + String next, Character eosChar) { + buf.append("x="); + buf.append(prefix); + collectFeats.add(buf.toString()); + buf.setLength(0); + if (!prefix.equals("")) { + collectFeats.add(Integer.toString(prefix.length())); + if (isFirstUpper(prefix)) { + collectFeats.add("xcap"); + } + if (eosChar != null && inducedAbbreviations.contains(prefix + eosChar)) { + collectFeats.add("xabbrev"); + } + } + + buf.append("v="); + buf.append(previous); + collectFeats.add(buf.toString()); + + buf.setLength(0); + if (!previous.equals("")) { + if (isFirstUpper(previous)) { + collectFeats.add("vcap"); + } + if (inducedAbbreviations.contains(previous)) { + collectFeats.add("vabbrev"); + } + } + + buf.append("s="); + buf.append(suffix); + collectFeats.add(buf.toString()); + buf.setLength(0); + if (!suffix.equals("")) { + if (isFirstUpper(suffix)) { + collectFeats.add("scap"); + } + if (inducedAbbreviations.contains(suffix)) { + collectFeats.add("sabbrev"); + } + } + + buf.append("n="); + buf.append(next); + collectFeats.add(buf.toString()); + buf.setLength(0); + if (!next.equals("")) { + if (isFirstUpper(next)) { + collectFeats.add("ncap"); + } + if (inducedAbbreviations.contains(next)) { + collectFeats.add("nabbrev"); + } + } + } + + private static boolean isFirstUpper(String s) { + return Character.isUpperCase(s.charAt(0)); + } + + /** + * Finds the index of the nearest space before a specified index which is not itself preceded by a space. + * + * @param sb The string buffer which contains the text being examined. + * @param seek The index to begin searching from. + * @return The index which contains the nearest space. + */ + private static int previousSpaceIndex(CharSequence sb, int seek) { + seek--; + while (seek > 0 && !StringUtil.isWhitespace(sb.charAt(seek))) { + seek--; + } + if (seek > 0 && StringUtil.isWhitespace(sb.charAt(seek))) { + while (seek > 0 && StringUtil.isWhitespace(sb.charAt(seek - 1))) + seek--; + return seek; + } + return 0; + } + + /** + * Finds the index of the nearest space after a specified index. + * + * @param sb The string buffer which contains the text being examined. + * @param seek The index to begin searching from. + * @param lastIndex The highest index of the StringBuffer sb. + * @return The index which contains the nearest space. + */ + private static int nextSpaceIndex(CharSequence sb, int seek, int lastIndex) { + seek++; + char c; + while (seek < lastIndex) { + c = sb.charAt(seek); + if (StringUtil.isWhitespace(c)) { + while (sb.length() > seek + 1 && StringUtil.isWhitespace(sb.charAt(seek + 1))) + seek++; + return seek; + } + seek++; + } + return lastIndex; + } +} diff --git a/third_party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/third_party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java new file mode 100644 index 00000000..ad43bbe8 --- /dev/null +++ b/third_party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package opennlp.tools.sentdetect; + +import opennlp.tools.dictionary.Dictionary; +import opennlp.tools.ml.EventTrainer; +import opennlp.tools.ml.TrainerFactory; +import opennlp.tools.ml.model.Event; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.sentdetect.lang.Factory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Span; +import opennlp.tools.util.StringUtil; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.model.ModelUtil; + +import java.io.IOException; +import java.util.*; + +/** + * A sentence detector for splitting up raw text into sentences. + *
<p>
+ * A maximum entropy model is used to evaluate end-of-sentence characters in a + * string to determine if they signify the end of a sentence. + */ +public class SentenceDetectorME implements SentenceDetector { + + /** + * Constant indicates a sentence split. + */ + public static final String SPLIT = "s"; + + /** + * Constant indicates no sentence split. + */ + public static final String NO_SPLIT = "n"; + + /** + * The maximum entropy model to use to evaluate contexts. + */ + private MaxentModel model; + + /** + * The feature context generator. + */ + private final SDContextGenerator cgen; + + /** + * The {@link EndOfSentenceScanner} to use when scanning for end of sentence offsets. + */ + private final EndOfSentenceScanner scanner; + + /** + * The list of probabilities associated with each decision. + */ + private List<Double> sentProbs = new ArrayList<>(); + + protected boolean useTokenEnd; + + /** + * Initializes the current instance. + * + * @param model the {@link SentenceModel} + */ + public SentenceDetectorME(SentenceModel model) { + SentenceDetectorFactory sdFactory = model.getFactory(); + this.model = model.getMaxentModel(); + cgen = sdFactory.getSDContextGenerator(); + scanner = sdFactory.getEndOfSentenceScanner(); + useTokenEnd = sdFactory.isUseTokenEnd(); + } + + /** + * @deprecated Use a {@link SentenceDetectorFactory} to extend + * SentenceDetector functionality. + */ + public SentenceDetectorME(SentenceModel model, Factory factory) { + this.model = model.getMaxentModel(); + // if the model has custom EOS characters set, use this to get the context + // generator and the EOS scanner; otherwise use language-specific defaults + char[] customEOSCharacters = model.getEosCharacters(); + if (customEOSCharacters == null) { + cgen = factory.createSentenceContextGenerator(model.getLanguage(), + getAbbreviations(model.getAbbreviations())); + scanner = factory.createEndOfSentenceScanner(model.getLanguage()); + } else { + cgen = factory.createSentenceContextGenerator( + getAbbreviations(model.getAbbreviations()), customEOSCharacters); + scanner = factory.createEndOfSentenceScanner(customEOSCharacters); + } + useTokenEnd = model.useTokenEnd(); + } + + private static Set<String> getAbbreviations(Dictionary abbreviations) { + if (abbreviations == null) { + return Collections.emptySet(); + } + return abbreviations.asStringSet(); + } + + /** + * Detect sentences in a String. + * + * @param s The string to be processed. + * + * @return A string array containing individual sentences as elements. + */ + public String[] sentDetect(String s) { + Span[] spans = sentPosDetect(s); + String[] sentences; + if (spans.length != 0) { + sentences = new String[spans.length]; + for (int si = 0; si < spans.length; si++) { + sentences[si] = spans[si].getCoveredText(s).toString(); + } + } + else { + sentences = new String[] {}; + } + return sentences; + } + + private int getFirstWS(String s, int pos) { + while (pos < s.length() && !StringUtil.isWhitespace(s.charAt(pos))) + pos++; + return pos; + } + + private int getFirstNonWS(String s, int pos) { + while (pos < s.length() && StringUtil.isWhitespace(s.charAt(pos))) + pos++; + return pos; + } + + /** + * Detect the position of the first words of sentences in a String. + * + * @param s The string to be processed.
+ * @return A Span array containing the positions of the end index of + * every sentence + * + */ + public Span[] sentPosDetect(String s) { + sentProbs.clear(); + StringBuilder sb = new StringBuilder(s); + List<Integer> enders = scanner.getPositions(s); + List<Integer> positions = new ArrayList<>(enders.size()); + + for (int i = 0, end = enders.size(), index = 0; i < end; i++) { + int cint = enders.get(i); + // skip over the leading parts of non-token final delimiters + int fws = getFirstWS(s,cint + 1); + if (i + 1 < end && enders.get(i + 1) < fws) { + continue; + } + if (positions.size() > 0 && cint < positions.get(positions.size() - 1)) continue; + + double[] probs = model.eval(cgen.getContext(sb, cint)); + String bestOutcome = model.getBestOutcome(probs); + + if (bestOutcome.equals(SPLIT) && isAcceptableBreak(s, index, cint)) { + if (index != cint) { + if (useTokenEnd) { + positions.add(getFirstNonWS(s, getFirstWS(s,cint + 1))); + } + else { + positions.add(getFirstNonWS(s, cint + 1)); + } + sentProbs.add(probs[model.getIndex(bestOutcome)]); + } + + index = cint + 1; + } + } + + int[] starts = new int[positions.size()]; + for (int i = 0; i < starts.length; i++) { + starts[i] = positions.get(i); + } + + // string does not contain sentence end positions + if (starts.length == 0) { + + // remove leading and trailing whitespace + int start = 0; + int end = s.length(); + + while (start < s.length() && StringUtil.isWhitespace(s.charAt(start))) + start++; + + while (end > 0 && StringUtil.isWhitespace(s.charAt(end - 1))) + end--; + + if (end - start > 0) { + sentProbs.add(1d); + return new Span[] {new Span(start, end)}; + } + else + return new Span[0]; + } + + // Convert the sentence end indexes to spans + + boolean leftover = starts[starts.length - 1] != s.length(); + Span[] spans = new Span[leftover ? starts.length + 1 : starts.length]; + + for (int si = 0; si < starts.length; si++) { + int start; + + if (si == 0) { + start = 0; + } + else { + start = starts[si - 1]; + } + + // A span might contain only white spaces, in this case the length of + // the span will be zero after trimming and should be ignored. + Span span = new Span(start, starts[si]).trim(s); + if (span.length() > 0) { + spans[si] = span; + } + else { + sentProbs.remove(si); + } + } + + if (leftover) { + Span span = new Span(starts[starts.length - 1], s.length()).trim(s); + if (span.length() > 0) { + spans[spans.length - 1] = span; + sentProbs.add(1d); + } + } + /* + * set the prob for each span + */ + for (int i = 0; i < spans.length; i++) { + double prob = sentProbs.get(i); + spans[i] = new Span(spans[i], prob); + + } + + return spans; + } + + /** + * Returns the probabilities associated with the most recent + * calls to sentDetect(). + * + * @return probability for each sentence returned for the most recent + * call to sentDetect. If not applicable an empty array is returned. + */ + public double[] getSentenceProbabilities() { + double[] sentProbArray = new double[sentProbs.size()]; + for (int i = 0; i < sentProbArray.length; i++) { + sentProbArray[i] = sentProbs.get(i); + } + return sentProbArray; + } + + /** + * Allows subclasses to check an overzealous (read: poorly + * trained) model from flagging obvious non-breaks as breaks based + * on some boolean determination of a break's acceptability. + * + *
<p>
The implementation here always returns true, which means + * that the MaxentModel's outcome is taken as is.
</p>
+ * + * @param s the string in which the break occurred. + * @param fromIndex the start of the segment currently being evaluated + * @param candidateIndex the index of the candidate sentence ending + * @return true if the break is acceptable + */ + protected boolean isAcceptableBreak(String s, int fromIndex, int candidateIndex) { + return true; + } + + /** + * @deprecated Use + * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters)} + * and pass in a {@link SentenceDetectorFactory}. + */ + public static SentenceModel train(String languageCode, + ObjectStream<SentenceSample> samples, boolean useTokenEnd, + Dictionary abbreviations, TrainingParameters mlParams) throws IOException { + SentenceDetectorFactory sdFactory = new SentenceDetectorFactory( + languageCode, useTokenEnd, abbreviations, null); + return train(languageCode, samples, sdFactory, mlParams); + } + + public static SentenceModel train(String languageCode, + ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory, + TrainingParameters mlParams) throws IOException { + + Map<String, String> manifestInfoEntries = new HashMap<>(); + + // TODO: Fix the EventStream to throw exceptions when training goes wrong + ObjectStream<Event> eventStream = new SDEventStream(samples, + sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner()); + + EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries); + + MaxentModel sentModel = trainer.train(eventStream); + + return new SentenceModel(languageCode, sentModel, manifestInfoEntries, sdFactory); + } + + /** + * @deprecated Use + * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters)} + * and pass in a {@link SentenceDetectorFactory}. + */ + @Deprecated + public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples, + boolean useTokenEnd, Dictionary abbreviations) throws IOException { + return train(languageCode, samples, useTokenEnd, abbreviations, + ModelUtil.createDefaultTrainingParameters()); + } +}
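
For orientation, here is a minimal sketch of how the vendored sentence detector is typically driven. The model path and sample text are hypothetical; SentenceModel and Span are the stock OpenNLP types, and the detector methods are the ones defined in the patched class above.

    import opennlp.tools.sentdetect.SentenceDetectorME;
    import opennlp.tools.sentdetect.SentenceModel;
    import opennlp.tools.util.Span;

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;

    class SentenceDetectorDemo {
        public static void main(String[] args) throws IOException {
            // Hypothetical location of a pre-trained OpenNLP sentence model
            Path modelPath = Path.of("/tmp/en-sent.bin");

            SentenceModel model;
            try (var in = Files.newInputStream(modelPath)) {
                model = new SentenceModel(in);
            }

            var detector = new SentenceDetectorME(model);
            String text = "Dr. Smith went to Washington. He arrived on Tuesday.";

            // sentDetect returns the sentence strings themselves ...
            for (String sentence : detector.sentDetect(text)) {
                System.out.println(sentence);
            }

            // ... while sentPosDetect returns their spans, with probabilities
            Span[] spans = detector.sentPosDetect(text);
            double[] probs = detector.getSentenceProbabilities();
            for (int i = 0; i < spans.length; i++) {
                System.out.printf("[%d, %d) p=%.2f%n",
                        spans[i].getStart(), spans[i].getEnd(), probs[i]);
            }
        }
    }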