diff --git a/code/api/assistant-api/build.gradle b/code/api/assistant-api/build.gradle index 6a46e5ef..5dc33d5c 100644 --- a/code/api/assistant-api/build.gradle +++ b/code/api/assistant-api/build.gradle @@ -11,7 +11,6 @@ java { } } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service-discovery') diff --git a/code/api/index-api/build.gradle b/code/api/index-api/build.gradle index 93672230..73520b36 100644 --- a/code/api/index-api/build.gradle +++ b/code/api/index-api/build.gradle @@ -12,7 +12,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service-discovery') diff --git a/code/api/search-api/build.gradle b/code/api/search-api/build.gradle index a314ea6c..8c38b5f3 100644 --- a/code/api/search-api/build.gradle +++ b/code/api/search-api/build.gradle @@ -12,7 +12,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service-discovery') diff --git a/code/common/service-client/build.gradle b/code/common/service-client/build.gradle index 834fc45f..6ba7f806 100644 --- a/code/common/service-client/build.gradle +++ b/code/common/service-client/build.gradle @@ -12,7 +12,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:service-discovery') diff --git a/code/crawl/common/build.gradle b/code/crawl/common/build.gradle index 67174793..483dbee8 100644 --- a/code/crawl/common/build.gradle +++ b/code/crawl/common/build.gradle @@ -12,7 +12,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:config') 
implementation project(':code:libraries:guarded-regex') diff --git a/code/crawl/converting-model/build.gradle b/code/crawl/converting-model/build.gradle index 15382ff2..29bbd025 100644 --- a/code/crawl/converting-model/build.gradle +++ b/code/crawl/converting-model/build.gradle @@ -11,7 +11,6 @@ java { } } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:api:index-api') implementation project(':code:common:service-discovery') diff --git a/code/crawl/converting-process/build.gradle b/code/crawl/converting-process/build.gradle index 805a5bc8..e33f0b44 100644 --- a/code/crawl/converting-process/build.gradle +++ b/code/crawl/converting-process/build.gradle @@ -19,7 +19,7 @@ application { tasks.distZip.enabled = false dependencies { - implementation project(':third-party') + implementation project(':third-party:porterstemmer') implementation project(':code:api:index-api') implementation project(':code:common:model') diff --git a/code/crawl/crawling-model/build.gradle b/code/crawl/crawling-model/build.gradle index e9cdbc01..d2803c21 100644 --- a/code/crawl/crawling-model/build.gradle +++ b/code/crawl/crawling-model/build.gradle @@ -13,7 +13,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:libraries:big-string') implementation project(':code:api:index-api') diff --git a/code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 49dee5b3..744236c0 100644 --- a/code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -2,13 +2,13 @@ package nu.marginalia.crawling.io; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; 
-import jdkoverride.LargeLineBufferedReader; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; @@ -29,7 +29,7 @@ public class CrawledDomainReader { public CrawledDomain read(Path path) throws IOException { DomainDataAssembler domainData = new DomainDataAssembler(); - try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) { + try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) { String line; while ((line = br.readLine()) != null) { if (line.startsWith("//")) { diff --git a/code/crawl/crawling-process/build.gradle b/code/crawl/crawling-process/build.gradle index 18e841b3..29630437 100644 --- a/code/crawl/crawling-process/build.gradle +++ b/code/crawl/crawling-process/build.gradle @@ -19,7 +19,6 @@ application { tasks.distZip.enabled = false dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service') diff --git a/code/crawl/experimental/build.gradle b/code/crawl/experimental/build.gradle index cfa239b2..85063928 100644 --- a/code/crawl/experimental/build.gradle +++ b/code/crawl/experimental/build.gradle @@ -12,7 +12,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service') diff --git a/code/crawl/loading-process/build.gradle b/code/crawl/loading-process/build.gradle index bf93444a..ea233dda 100644 --- a/code/crawl/loading-process/build.gradle +++ 
b/code/crawl/loading-process/build.gradle @@ -18,7 +18,6 @@ application { tasks.distZip.enabled = false dependencies { - implementation project(':third-party') implementation project(':code:api:index-api') implementation project(':code:common:model') implementation project(':code:common:config') diff --git a/code/features/random-websites/build.gradle b/code/features/random-websites/build.gradle index 4e1ff560..1e018f03 100644 --- a/code/features/random-websites/build.gradle +++ b/code/features/random-websites/build.gradle @@ -15,7 +15,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:service') diff --git a/code/features/screenshots/build.gradle b/code/features/screenshots/build.gradle index db62f15c..0e014011 100644 --- a/code/features/screenshots/build.gradle +++ b/code/features/screenshots/build.gradle @@ -11,7 +11,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:service') diff --git a/code/features/topic-detection/build.gradle b/code/features/topic-detection/build.gradle index 035e9974..0f764b67 100644 --- a/code/features/topic-detection/build.gradle +++ b/code/features/topic-detection/build.gradle @@ -16,7 +16,7 @@ java { dependencies { implementation project(':code:common:config') implementation project(':code:libraries:language-processing') - implementation project(':third-party') + implementation project(':third-party:porterstemmer') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index 8a789e59..011a39ac 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -18,7 +18,8 @@ dependencies { implementation project(':code:index:index-journal') implementation project(':code:index:lexicon') implementation 
project(':code:common:model') - implementation project(':third-party') + + implementation project(':third-party:uppend') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/index/index-journal/build.gradle b/code/index/index-journal/build.gradle index 65910447..9aeaa209 100644 --- a/code/index/index-journal/build.gradle +++ b/code/index/index-journal/build.gradle @@ -14,7 +14,6 @@ dependencies { implementation project(':code:libraries:array') implementation project(':code:common:model') implementation project(':code:index:lexicon') - implementation project(':third-party') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/libraries/array/build.gradle b/code/libraries/array/build.gradle index 65574dfd..295a4dc5 100644 --- a/code/libraries/array/build.gradle +++ b/code/libraries/array/build.gradle @@ -9,7 +9,7 @@ java { } dependencies { - implementation project(':third-party') + implementation project(':third-party:uppend') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/libraries/btree/build.gradle b/code/libraries/btree/build.gradle index c5a9950e..83b5fc7b 100644 --- a/code/libraries/btree/build.gradle +++ b/code/libraries/btree/build.gradle @@ -9,7 +9,6 @@ java { } dependencies { - implementation project(':third-party') implementation project(':code:libraries:array') implementation project(':code:libraries:next-prime') diff --git a/code/libraries/language-processing/build.gradle b/code/libraries/language-processing/build.gradle index dfcec644..f0d52d1f 100644 --- a/code/libraries/language-processing/build.gradle +++ b/code/libraries/language-processing/build.gradle @@ -15,7 +15,9 @@ java { } dependencies { - implementation project(':third-party') + implementation project(':third-party:rdrpostagger') + implementation project(':third-party:porterstemmer') + implementation project(':third-party:monkey-patch-opennlp') implementation project(':code:common:model') implementation 
project(':code:common:config') implementation project(':code:libraries:easy-lsh') diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index ad6e0167..d9b04f0e 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -22,7 +22,7 @@ java { } dependencies { - implementation project(':third-party') + implementation project(':third-party:symspell') implementation project(':code:api:assistant-api') implementation project(':code:common:config') implementation project(':code:common:service') diff --git a/code/services-core/assistant-service/src/test/java/nu/marginalia/assistant/dict/WikiCleanerTest.java b/code/services-core/assistant-service/src/test/java/nu/marginalia/assistant/dict/WikiCleanerTest.java deleted file mode 100644 index 3c2bda7a..00000000 --- a/code/services-core/assistant-service/src/test/java/nu/marginalia/assistant/dict/WikiCleanerTest.java +++ /dev/null @@ -1,45 +0,0 @@ -package nu.marginalia.assistant.dict; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.openzim.ZIMTypes.ZIMFile; -import org.openzim.ZIMTypes.ZIMReader; - -import java.io.IOException; - -class WikiCleanerTest { - - @Test - void cleanWikiJunk() throws IOException { -// String str = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Scamander", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.wiki.html")))); -// String str2 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Plato", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.wiki.html")))); -// String str3 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/C++", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.wiki.html")))); -// String str4 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Memex", new 
String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.wiki.html")))); -// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.out.html"), str); -// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.out.html"), str2); -// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.out.html"), str3); -// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.out.html"), str4); - } - - @Test @Disabled - public void readZim() throws IOException { - var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim")); -// try (var pw = new PrintWriter(new File("/home/vlofgren/Work/article-clusters.tsv"))) { -// zr.enumerateArticles(pw); -// } - zr.forEachArticles((url, art) -> { - if (art != null) { - System.out.println(url); - } -// if (art != null && art.length() > 5) { -// System.out.println(url + " -> " + art.substring(0, 5)); -// } - }, (p) -> true); - - /*try (var baos = zr.getArticleData("Giraffe", 'A')) { - String str = baos.toString(); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.wiki.html"), str); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.out.html"), new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Giraffe", str)); - }*/ - } -} \ No newline at end of file diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 42b6d8c7..1ed30ea5 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -21,7 +21,6 @@ java { } } dependencies { - implementation project(':third-party') implementation project(':code:common:config') implementation project(':code:common:model') implementation project(':code:common:service') diff --git a/code/services-core/search-service/build.gradle b/code/services-core/search-service/build.gradle index 58a205e3..47e9f1d7 100644 --- a/code/services-core/search-service/build.gradle 
+++ b/code/services-core/search-service/build.gradle @@ -21,7 +21,6 @@ java { } } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:common:config') diff --git a/code/services-satellite/api-service/build.gradle b/code/services-satellite/api-service/build.gradle index acf0dbfc..6a40dd89 100644 --- a/code/services-satellite/api-service/build.gradle +++ b/code/services-satellite/api-service/build.gradle @@ -22,7 +22,6 @@ tasks.distZip.enabled = false apply from: "$rootProject.projectDir/docker-service.gradle" dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:common:config') diff --git a/code/services-satellite/dating-service/build.gradle b/code/services-satellite/dating-service/build.gradle index 0e72441b..3208e477 100644 --- a/code/services-satellite/dating-service/build.gradle +++ b/code/services-satellite/dating-service/build.gradle @@ -21,7 +21,6 @@ java { } } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:common:service-discovery') diff --git a/code/services-satellite/explorer-service/build.gradle b/code/services-satellite/explorer-service/build.gradle index 8034a42d..fee0e6dd 100644 --- a/code/services-satellite/explorer-service/build.gradle +++ b/code/services-satellite/explorer-service/build.gradle @@ -21,7 +21,6 @@ java { } } dependencies { - implementation project(':third-party') implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:common:service-discovery') diff --git a/docker-service.gradle b/docker-service.gradle index 9f8f1bbe..15086ce7 100644 --- a/docker-service.gradle +++ b/docker-service.gradle @@ -4,10 
+4,12 @@ ext { serviceToolOpts='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5000' } -docker { - var df = new File(buildDir, "Dockerfile") +tasks.register('dockerFile') { + buildDir.mkdir() - df.text = """# + var df = new File(buildDir, "Dockerfile") + doLast { + df.text = """# # I'm auto-generated, please don't make changes to me or commit me to git # # The template exists in docker-service.gradle @@ -22,11 +24,23 @@ ENV JAVA_OPTS="${serviceJvmOpts} " ENTRYPOINT WMSA_HOME=/wmsa /${application.applicationName}/bin/${application.applicationName} \${arg0} \${arg1} """ + } + it.outputs.file(df) +} - dockerfile = new File(buildDir, "Dockerfile") +dockerPrepare { + dependsOn tasks.dockerFile +} + +dockerfileZip { + dependsOn tasks.dockerFile +} + + +docker { + dockerfile = tasks.dockerFile.outputs.files.singleFile name = 'marginalia.nu/'+application.applicationName+':latest' files tasks.distTar.outputs tags 'latest' - dependsOn tasks.distTar } diff --git a/other/memex/build.gradle b/other/memex/build.gradle index 78b13789..5e422b57 100644 --- a/other/memex/build.gradle +++ b/other/memex/build.gradle @@ -59,7 +59,6 @@ jmhJar { zip64 true } dependencies { - implementation project(':third-party') implementation project(':code:common:service') implementation project(':code:common:config') implementation project(':code:common:service-discovery') diff --git a/other/wmsa_old/build.gradle b/other/wmsa_old/build.gradle index 8b41cbc9..4f38b40c 100644 --- a/other/wmsa_old/build.gradle +++ b/other/wmsa_old/build.gradle @@ -30,7 +30,6 @@ java { } } dependencies { - implementation project(':third-party') implementation project(':code:common:service') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') diff --git a/settings.gradle b/settings.gradle index b871494c..db1822aa 100644 --- a/settings.gradle +++ b/settings.gradle @@ -52,7 +52,14 @@ include 'code:crawl:loading-process' include 'code:crawl:common' 
include 'code:crawl:experimental' -include 'third-party' +include 'third-party:porterstemmer' +include 'third-party:xz' +include 'third-party:symspell' +include 'third-party:rdrpostagger' +include 'third-party:uppend' +include 'third-party:openzim' +include 'third-party:monkey-patch-opennlp' + include 'other:memex' include 'other:wmsa_old' diff --git a/third-party/README.md b/third-party/README.md index 70b6340c..577566bd 100644 --- a/third-party/README.md +++ b/third-party/README.md @@ -6,14 +6,11 @@ or lack an artifact, or to override some default that is inappropriate for the t ## Sources and Licenses ### Modified -* [RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3 -* [PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3 -* [Uppend](https://github.com/upserve/uppend) - MIT -* [OpenZIM](https://github.com/openzim/libzim) - GPL-2.0 -* [XZ for Java](https://tukaani.org/xz/) - Public Domain -* [SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0 +* [RDRPosTagger](rdrpostagger/) - GPL3 +* [PorterStemmer](porterstemmer/) - LGPL3 +* [Uppend](uppend/) - MIT +* [OpenZIM](openzim/) - GPL-2.0 +* [SymSpell](symspell/) - LGPL-3.0 ### Monkey Patched -* [GSON](https://github.com/google/gson) - Apache-2.0 -* OpenJDK - GPL-2.0 (packaged under jdkoverride) -* Stanford OpenNLP - Apache-2.0 +* [Apache OpenNLP](monkey-patch-opennlp/) - Apache-2.0 diff --git a/third-party/build.gradle b/third-party/monkey-patch-opennlp/build.gradle similarity index 96% rename from third-party/build.gradle rename to third-party/monkey-patch-opennlp/build.gradle index 4128ac2b..1d6a4bc3 100644 --- a/third-party/build.gradle +++ b/third-party/monkey-patch-opennlp/build.gradle @@ -27,5 +27,5 @@ dependencies { } test { - useJUnitPlatform() + useJUnitPlatform() } diff --git a/third-party/monkey-patch-opennlp/readme.md b/third-party/monkey-patch-opennlp/readme.md new file mode 100644 index 00000000..be7d7490 --- /dev/null +++ 
b/third-party/monkey-patch-opennlp/readme.md @@ -0,0 +1,11 @@ +# Monkey Patched OpenNLP + +Apache OpenNLP - Apache-2.0 + +## Rationale + +OpenNLP's sentence detector uses a slow StringBuffer instead of a StringBuilder where it makes +no sense to do so. This makes it much slower than it needs to be. I've found no way to file issues with the +project to get it fixed. Instead we're doing this monkey patch where the class is overridden with something +better. + diff --git a/third-party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java b/third-party/monkey-patch-opennlp/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java similarity index 100% rename from third-party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java rename to third-party/monkey-patch-opennlp/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java diff --git a/third-party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/third-party/monkey-patch-opennlp/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java similarity index 100% rename from third-party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java rename to third-party/monkey-patch-opennlp/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java diff --git a/third-party/openzim/build.gradle b/third-party/openzim/build.gradle new file mode 100644 index 00000000..e56a9df5 --- /dev/null +++ b/third-party/openzim/build.gradle @@ -0,0 +1,24 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation libs.bundles.nlp + implementation libs.zstd + implementation libs.commons.compress + implementation libs.ffi + implementation libs.databind + implementation libs.bundles.gson + + implementation project(':third-party:xz') +} + +test { + useJUnitPlatform() +} diff --git a/third-party/openzim/readme.md b/third-party/openzim/readme.md new file mode 100644 index 
00000000..ee47e601 --- /dev/null +++ b/third-party/openzim/readme.md @@ -0,0 +1,11 @@ +# OpenZIM + +[OpenZIM](https://github.com/openzim/libzim) - GPL-2.0 + +OpenZIM is a ZIM file reader. This code has been modified in a fairly crude manner +to be much faster than the original code base which seems quite antique. It also +supports XZ compression. + +**Important Note** the license is incompatible with AGPL 3, so we can't link Marginalia +directly to this. It's still very useful for building tools that deal with +wikipedia data which would be stand-alone. \ No newline at end of file diff --git a/third-party/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java similarity index 100% rename from third-party/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java rename to third-party/openzim/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java diff --git a/third-party/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java similarity index 100% rename from third-party/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java rename to third-party/openzim/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java diff --git a/third-party/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java similarity index 100% rename from third-party/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java rename to third-party/openzim/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java diff --git a/third-party/src/main/java/org/openzim/ZIMTypes/ZIMFile.java b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMFile.java similarity index 100% rename from third-party/src/main/java/org/openzim/ZIMTypes/ZIMFile.java rename to third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMFile.java diff --git a/third-party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java 
b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java similarity index 100% rename from third-party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java rename to third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java diff --git a/third-party/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java b/third-party/openzim/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java similarity index 100% rename from third-party/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java rename to third-party/openzim/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java diff --git a/third-party/src/main/java/org/openzim/util/Utilities.java b/third-party/openzim/src/main/java/org/openzim/util/Utilities.java similarity index 100% rename from third-party/src/main/java/org/openzim/util/Utilities.java rename to third-party/openzim/src/main/java/org/openzim/util/Utilities.java diff --git a/third-party/porterstemmer/build.gradle b/third-party/porterstemmer/build.gradle new file mode 100644 index 00000000..de627417 --- /dev/null +++ b/third-party/porterstemmer/build.gradle @@ -0,0 +1,16 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { +} + +test { + useJUnitPlatform() +} diff --git a/third-party/porterstemmer/readme.md b/third-party/porterstemmer/readme.md new file mode 100644 index 00000000..c5e17793 --- /dev/null +++ b/third-party/porterstemmer/readme.md @@ -0,0 +1,6 @@ +# Porterstemmer + +[PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3 + +It's a [porter stemmer](https://tartarus.org/martin/PorterStemmer/) library, although one comes with OpenNLP +too. TBD which one to use, they're fairly equivalent. 
\ No newline at end of file diff --git a/third-party/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java b/third-party/porterstemmer/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java similarity index 100% rename from third-party/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java rename to third-party/porterstemmer/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java diff --git a/third-party/rdrpostagger/build.gradle b/third-party/rdrpostagger/build.gradle new file mode 100644 index 00000000..de627417 --- /dev/null +++ b/third-party/rdrpostagger/build.gradle @@ -0,0 +1,16 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { +} + +test { + useJUnitPlatform() +} diff --git a/third-party/rdrpostagger/readme.md b/third-party/rdrpostagger/readme.md new file mode 100644 index 00000000..1627eb55 --- /dev/null +++ b/third-party/rdrpostagger/readme.md @@ -0,0 +1,12 @@ +# RDRPosTagger + +[RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3 + +datquocnguyen's excellent fast POS tagger. It's been crudely modified to be faster. +Unlike the original, it only does English. + +## Citations + +- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [RDRPOSTagger: A Ripple Down Rules-based Part-Of-Speech Tagger](http://www.aclweb.org/anthology/E14-2005). In *Proceedings of the Demonstrations at the 14th Conference of the European Chapter of the Association for Computational Linguistics*, EACL 2014, pp. 17-20, 2014. [[.PDF]](http://www.aclweb.org/anthology/E14-2005) [[.bib]](http://www.aclweb.org/anthology/E14-2005.bib) + +- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [A Robust Transformation-Based Learning Approach Using Ripple Down Rules for Part-Of-Speech Tagging](http://content.iospress.com/articles/ai-communications/aic698). *AI Communications* (AICom), vol. 29, no. 3, pp. 409-422, 2016. 
[[.PDF]](http://arxiv.org/pdf/1412.4021.pdf) [[.bib]](http://rdrpostagger.sourceforge.net/AICom.bib) diff --git a/third-party/src/main/java/com/github/datquocnguyen/FWObject.java b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/FWObject.java similarity index 100% rename from third-party/src/main/java/com/github/datquocnguyen/FWObject.java rename to third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/FWObject.java diff --git a/third-party/src/main/java/com/github/datquocnguyen/InitialTagger.java b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/InitialTagger.java similarity index 100% rename from third-party/src/main/java/com/github/datquocnguyen/InitialTagger.java rename to third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/InitialTagger.java diff --git a/third-party/src/main/java/com/github/datquocnguyen/Node.java b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/Node.java similarity index 100% rename from third-party/src/main/java/com/github/datquocnguyen/Node.java rename to third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/Node.java diff --git a/third-party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java similarity index 100% rename from third-party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java rename to third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java diff --git a/third-party/src/main/java/com/github/datquocnguyen/Utils.java b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/Utils.java similarity index 100% rename from third-party/src/main/java/com/github/datquocnguyen/Utils.java rename to third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/Utils.java diff --git a/third-party/src/main/java/com/github/datquocnguyen/WordTag.java b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/WordTag.java 
similarity index 100% rename from third-party/src/main/java/com/github/datquocnguyen/WordTag.java rename to third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/WordTag.java diff --git a/third-party/src/main/java/com/google/gson/stream/JsonReader.java b/third-party/src/main/java/com/google/gson/stream/JsonReader.java deleted file mode 100644 index 213feffa..00000000 --- a/third-party/src/main/java/com/google/gson/stream/JsonReader.java +++ /dev/null @@ -1,1637 +0,0 @@ -/* - * Copyright (C) 2010 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.google.gson.stream; - -import com.google.gson.internal.JsonReaderInternalAccess; -import com.google.gson.internal.bind.JsonTreeReader; -import java.io.Closeable; -import java.io.EOFException; -import java.io.IOException; -import java.io.Reader; -import java.util.Arrays; - -/** - * Reads a JSON (RFC 7159) - * encoded value as a stream of tokens. This stream includes both literal - * values (strings, numbers, booleans, and nulls) as well as the begin and - * end delimiters of objects and arrays. The tokens are traversed in - * depth-first order, the same order that they appear in the JSON document. - * Within JSON objects, name/value pairs are represented by a single token. - * - *
Next, create handler methods for each structure in your JSON text. You'll - * need a method for each object type and for each array type. - *
When a nested object or array is encountered, delegate to the - * corresponding handler method. - * - *
When an unknown name is encountered, strict parsers should fail with an - * exception. Lenient parsers should call {@link #skipValue()} to recursively - * skip the value's nested tokens, which may otherwise conflict. - * - *
If a value may be null, you should first check using {@link #peek()}. - * Null literals can be consumed using either {@link #nextNull()} or {@link - * #skipValue()}. - * - *
{@code - * [ - * { - * "id": 912345678901, - * "text": "How do I read a JSON stream in Java?", - * "geo": null, - * "user": { - * "name": "json_newb", - * "followers_count": 41 - * } - * }, - * { - * "id": 912345678902, - * "text": "@json_newb just use JsonReader!", - * "geo": [50.454722, -104.606667], - * "user": { - * "name": "jesse", - * "followers_count": 2 - * } - * } - * ]}- * This code implements the parser for the above structure:
{@code - * - * public List- * - *readJsonStream(InputStream in) throws IOException { - * JsonReader reader = new JsonReader(new InputStreamReader(in, "UTF-8")); - * try { - * return readMessagesArray(reader); - * } finally { - * reader.close(); - * } - * } - * - * public List readMessagesArray(JsonReader reader) throws IOException { - * List messages = new ArrayList (); - * - * reader.beginArray(); - * while (reader.hasNext()) { - * messages.add(readMessage(reader)); - * } - * reader.endArray(); - * return messages; - * } - * - * public Message readMessage(JsonReader reader) throws IOException { - * long id = -1; - * String text = null; - * User user = null; - * List geo = null; - * - * reader.beginObject(); - * while (reader.hasNext()) { - * String name = reader.nextName(); - * if (name.equals("id")) { - * id = reader.nextLong(); - * } else if (name.equals("text")) { - * text = reader.nextString(); - * } else if (name.equals("geo") && reader.peek() != JsonToken.NULL) { - * geo = readDoublesArray(reader); - * } else if (name.equals("user")) { - * user = readUser(reader); - * } else { - * reader.skipValue(); - * } - * } - * reader.endObject(); - * return new Message(id, text, user, geo); - * } - * - * public List readDoublesArray(JsonReader reader) throws IOException { - * List doubles = new ArrayList (); - * - * reader.beginArray(); - * while (reader.hasNext()) { - * doubles.add(reader.nextDouble()); - * } - * reader.endArray(); - * return doubles; - * } - * - * public User readUser(JsonReader reader) throws IOException { - * String username = null; - * int followersCount = -1; - * - * reader.beginObject(); - * while (reader.hasNext()) { - * String name = reader.nextName(); - * if (name.equals("name")) { - * username = reader.nextString(); - * } else if (name.equals("followers_count")) { - * followersCount = reader.nextInt(); - * } else { - * reader.skipValue(); - * } - * } - * reader.endObject(); - * return new User(username, followersCount); - * }}