fab36d6e63
Adds experimental sideloading support for pushshift.io-style reddit data. This dataset is limited to data older than 2023, due to licensing changes making large-scale data extraction difficult. Since the median post quality on reddit is not very good, the sideloader will only load a subset of self-texts and top-level comments that have sufficiently many upvotes. Empirically this appears to mostly return good matches, even if it probably could index more. Tests were written for this, but all require local reddit data which can't be distributed with the source code. If this data cannot be found, the tests will short-circuit as OK. They're mostly there for debugging, and it's fine if they don't always run. The change also refactors the sideloading a bit since it was a bit messy.
99 lines
3.2 KiB
Groovy
99 lines
3.2 KiB
Groovy
// Gradle plugins applied to this module: Java compilation, the application
// plugin (configured in the `application` block), and the JVM test suite plugin.
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

// Compile and run with a Java 21 toolchain, independent of the JDK that
// launches Gradle itself.
java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

// Application plugin settings: the main class to launch and the name used
// for the generated application.
application {
    mainClass = 'nu.marginalia.converting.ConverterMain'
    applicationName = 'converter-process'
}

// Turn off the distZip task for this module.
// NOTE(review): presumably the zip distribution is not needed here — confirm.
tasks.distZip.enabled = false

// Module dependencies. Declaration order and blank-line grouping follow the
// original layout; the comments only label the groups by their project paths.
dependencies {
    // Shared process infrastructure
    implementation project(':code:common:process')

    // Vendored third-party code
    implementation project(':third-party:porterstemmer')
    implementation project(':third-party:count-min-sketch')

    // API modules
    implementation project(':code:api:index-api')
    implementation project(':code:api:process-mqapi')

    // Common modules and supporting libraries
    implementation project(':code:common:model')
    implementation project(':code:common:db')
    implementation project(':code:common:service')
    implementation project(':code:common:config')
    implementation project(':code:libraries:message-queue')
    implementation project(':code:libraries:blocking-thread-pool')
    implementation project(':code:common:service-discovery')
    implementation project(':code:common:service-client')

    // Utility libraries
    implementation project(':code:libraries:guarded-regex')
    implementation project(':code:libraries:easy-lsh')
    implementation project(':code:libraries:geo-ip')
    implementation project(':code:libraries:big-string')
    implementation project(':code:libraries:language-processing')

    // Process data models
    implementation project(':code:process-models:processed-data')
    implementation project(':code:process-models:work-log')
    implementation project(':code:process-models:crawling-model')

    // Conversion features (includes the reddit-json module)
    implementation project(':code:features-convert:adblock')
    implementation project(':code:features-convert:anchor-keywords')
    implementation project(':code:features-convert:topic-detection')
    implementation project(':code:features-convert:pubdate')
    implementation project(':code:features-convert:keyword-extraction')
    implementation project(':code:features-convert:summary-extraction')
    implementation project(':code:features-convert:stackexchange-xml')
    implementation project(':code:features-convert:reddit-json')

    // Crawl features
    implementation project(':code:features-crawl:crawl-blocklist')
    implementation project(':code:features-crawl:link-parser')
    implementation project(':code:features-crawl:content-type')

    // Project modules needed only by tests
    testImplementation project(':code:libraries:term-frequency-dict')
    testImplementation project(':code:process-models:crawl-spec')

    // External libraries from the `libs` version catalog
    implementation libs.bundles.slf4j

    implementation libs.notnull
    implementation libs.jwarc

    implementation libs.jsoup

    implementation libs.guice
    implementation libs.guava
    implementation libs.bundles.gson

    implementation libs.zstd

    implementation libs.bundles.mariadb
    implementation libs.bundles.nlp

    implementation libs.trove
    implementation libs.fastutil

    implementation libs.snakeyaml

    implementation libs.crawlercommons

    implementation libs.commons.lang3
    implementation libs.commons.compress
    implementation libs.sqlite

    // Test-only external libraries
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    // Test-only process modules
    testImplementation project(':code:processes:test-data')
    testImplementation project(':code:processes:crawling-process')
}