The refactoring will continue until morale improves.
This commit is contained in:
parent
4cec89da91
commit
616effdb3c
@ -11,7 +11,6 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -12,7 +12,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -12,7 +12,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -12,7 +12,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
|
||||||
|
|
||||||
|
@ -12,7 +12,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:libraries:guarded-regex')
|
implementation project(':code:libraries:guarded-regex')
|
||||||
|
@ -11,7 +11,6 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:api:index-api')
|
implementation project(':code:api:index-api')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -19,7 +19,7 @@ application {
|
|||||||
tasks.distZip.enabled = false
|
tasks.distZip.enabled = false
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
implementation project(':third-party:porterstemmer')
|
||||||
implementation project(':code:api:index-api')
|
implementation project(':code:api:index-api')
|
||||||
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
|
@ -13,7 +13,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:libraries:big-string')
|
implementation project(':code:libraries:big-string')
|
||||||
implementation project(':code:api:index-api')
|
implementation project(':code:api:index-api')
|
||||||
|
@ -2,13 +2,13 @@ package nu.marginalia.crawling.io;
|
|||||||
|
|
||||||
import com.github.luben.zstd.ZstdInputStream;
|
import com.github.luben.zstd.ZstdInputStream;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import jdkoverride.LargeLineBufferedReader;
|
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.crawling.model.CrawledDomain;
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
@ -29,7 +29,7 @@ public class CrawledDomainReader {
|
|||||||
public CrawledDomain read(Path path) throws IOException {
|
public CrawledDomain read(Path path) throws IOException {
|
||||||
DomainDataAssembler domainData = new DomainDataAssembler();
|
DomainDataAssembler domainData = new DomainDataAssembler();
|
||||||
|
|
||||||
try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||||
String line;
|
String line;
|
||||||
while ((line = br.readLine()) != null) {
|
while ((line = br.readLine()) != null) {
|
||||||
if (line.startsWith("//")) {
|
if (line.startsWith("//")) {
|
||||||
|
@ -19,7 +19,6 @@ application {
|
|||||||
tasks.distZip.enabled = false
|
tasks.distZip.enabled = false
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
|
@ -12,7 +12,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
|
@ -18,7 +18,6 @@ application {
|
|||||||
tasks.distZip.enabled = false
|
tasks.distZip.enabled = false
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:api:index-api')
|
implementation project(':code:api:index-api')
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
|
@ -15,7 +15,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
|
|
||||||
|
@ -11,7 +11,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ java {
|
|||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:libraries:language-processing')
|
implementation project(':code:libraries:language-processing')
|
||||||
implementation project(':third-party')
|
implementation project(':third-party:porterstemmer')
|
||||||
|
|
||||||
implementation libs.lombok
|
implementation libs.lombok
|
||||||
annotationProcessor libs.lombok
|
annotationProcessor libs.lombok
|
||||||
|
@ -18,7 +18,8 @@ dependencies {
|
|||||||
implementation project(':code:index:index-journal')
|
implementation project(':code:index:index-journal')
|
||||||
implementation project(':code:index:lexicon')
|
implementation project(':code:index:lexicon')
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':third-party')
|
|
||||||
|
implementation project(':third-party:uppend')
|
||||||
|
|
||||||
implementation libs.lombok
|
implementation libs.lombok
|
||||||
annotationProcessor libs.lombok
|
annotationProcessor libs.lombok
|
||||||
|
@ -14,7 +14,6 @@ dependencies {
|
|||||||
implementation project(':code:libraries:array')
|
implementation project(':code:libraries:array')
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:index:lexicon')
|
implementation project(':code:index:lexicon')
|
||||||
implementation project(':third-party')
|
|
||||||
|
|
||||||
implementation libs.lombok
|
implementation libs.lombok
|
||||||
annotationProcessor libs.lombok
|
annotationProcessor libs.lombok
|
||||||
|
@ -9,7 +9,7 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
implementation project(':third-party:uppend')
|
||||||
|
|
||||||
implementation libs.lombok
|
implementation libs.lombok
|
||||||
annotationProcessor libs.lombok
|
annotationProcessor libs.lombok
|
||||||
|
@ -9,7 +9,6 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:libraries:array')
|
implementation project(':code:libraries:array')
|
||||||
implementation project(':code:libraries:next-prime')
|
implementation project(':code:libraries:next-prime')
|
||||||
|
|
||||||
|
@ -15,7 +15,9 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
implementation project(':third-party:rdrpostagger')
|
||||||
|
implementation project(':third-party:porterstemmer')
|
||||||
|
implementation project(':third-party:monkey-patch-opennlp')
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:libraries:easy-lsh')
|
implementation project(':code:libraries:easy-lsh')
|
||||||
|
@ -22,7 +22,7 @@ java {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
implementation project(':third-party:symspell')
|
||||||
implementation project(':code:api:assistant-api')
|
implementation project(':code:api:assistant-api')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
|
@ -1,45 +0,0 @@
|
|||||||
package nu.marginalia.assistant.dict;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.openzim.ZIMTypes.ZIMFile;
|
|
||||||
import org.openzim.ZIMTypes.ZIMReader;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
class WikiCleanerTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void cleanWikiJunk() throws IOException {
|
|
||||||
// String str = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Scamander", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.wiki.html"))));
|
|
||||||
// String str2 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Plato", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.wiki.html"))));
|
|
||||||
// String str3 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/C++", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.wiki.html"))));
|
|
||||||
// String str4 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Memex", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.wiki.html"))));
|
|
||||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.out.html"), str);
|
|
||||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.out.html"), str2);
|
|
||||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.out.html"), str3);
|
|
||||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.out.html"), str4);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test @Disabled
|
|
||||||
public void readZim() throws IOException {
|
|
||||||
var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
|
|
||||||
// try (var pw = new PrintWriter(new File("/home/vlofgren/Work/article-clusters.tsv"))) {
|
|
||||||
// zr.enumerateArticles(pw);
|
|
||||||
// }
|
|
||||||
zr.forEachArticles((url, art) -> {
|
|
||||||
if (art != null) {
|
|
||||||
System.out.println(url);
|
|
||||||
}
|
|
||||||
// if (art != null && art.length() > 5) {
|
|
||||||
// System.out.println(url + " -> " + art.substring(0, 5));
|
|
||||||
// }
|
|
||||||
}, (p) -> true);
|
|
||||||
|
|
||||||
/*try (var baos = zr.getArticleData("Giraffe", 'A')) {
|
|
||||||
String str = baos.toString();
|
|
||||||
Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.wiki.html"), str);
|
|
||||||
Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.out.html"), new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Giraffe", str));
|
|
||||||
}*/
|
|
||||||
}
|
|
||||||
}
|
|
@ -21,7 +21,6 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
|
@ -21,7 +21,6 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
|
@ -22,7 +22,6 @@ tasks.distZip.enabled = false
|
|||||||
apply from: "$rootProject.projectDir/docker-service.gradle"
|
apply from: "$rootProject.projectDir/docker-service.gradle"
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
|
@ -21,7 +21,6 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -21,7 +21,6 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -4,10 +4,12 @@ ext {
|
|||||||
serviceToolOpts='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5000'
|
serviceToolOpts='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5000'
|
||||||
}
|
}
|
||||||
|
|
||||||
docker {
|
tasks.register('dockerFile') {
|
||||||
var df = new File(buildDir, "Dockerfile")
|
buildDir.mkdir()
|
||||||
|
|
||||||
df.text = """#
|
var df = new File(buildDir, "Dockerfile")
|
||||||
|
doLast {
|
||||||
|
df.text = """#
|
||||||
# I'm auto-generated, please don't make changes to me or commit me to git
|
# I'm auto-generated, please don't make changes to me or commit me to git
|
||||||
#
|
#
|
||||||
# The template exists in docker-service.gradle
|
# The template exists in docker-service.gradle
|
||||||
@ -22,11 +24,23 @@ ENV JAVA_OPTS="${serviceJvmOpts} "
|
|||||||
|
|
||||||
ENTRYPOINT WMSA_HOME=/wmsa /${application.applicationName}/bin/${application.applicationName} \${arg0} \${arg1}
|
ENTRYPOINT WMSA_HOME=/wmsa /${application.applicationName}/bin/${application.applicationName} \${arg0} \${arg1}
|
||||||
"""
|
"""
|
||||||
|
}
|
||||||
|
it.outputs.file(df)
|
||||||
|
}
|
||||||
|
|
||||||
dockerfile = new File(buildDir, "Dockerfile")
|
dockerPrepare {
|
||||||
|
dependsOn tasks.dockerFile
|
||||||
|
}
|
||||||
|
|
||||||
|
dockerfileZip {
|
||||||
|
dependsOn tasks.dockerFile
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
docker {
|
||||||
|
dockerfile = tasks.dockerFile.outputs.files.singleFile
|
||||||
name = 'marginalia.nu/'+application.applicationName+':latest'
|
name = 'marginalia.nu/'+application.applicationName+':latest'
|
||||||
files tasks.distTar.outputs
|
files tasks.distTar.outputs
|
||||||
tags 'latest'
|
tags 'latest'
|
||||||
|
|
||||||
dependsOn tasks.distTar
|
dependsOn tasks.distTar
|
||||||
}
|
}
|
||||||
|
@ -59,7 +59,6 @@ jmhJar {
|
|||||||
zip64 true
|
zip64 true
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
|
@ -30,7 +30,6 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party')
|
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
implementation project(':code:common:service-client')
|
implementation project(':code:common:service-client')
|
||||||
|
@ -52,7 +52,14 @@ include 'code:crawl:loading-process'
|
|||||||
include 'code:crawl:common'
|
include 'code:crawl:common'
|
||||||
include 'code:crawl:experimental'
|
include 'code:crawl:experimental'
|
||||||
|
|
||||||
include 'third-party'
|
include 'third-party:porterstemmer'
|
||||||
|
include 'third-party:xz'
|
||||||
|
include 'third-party:symspell'
|
||||||
|
include 'third-party:rdrpostagger'
|
||||||
|
include 'third-party:uppend'
|
||||||
|
include 'third-party:openzim'
|
||||||
|
include 'third-party:monkey-patch-opennlp'
|
||||||
|
|
||||||
include 'other:memex'
|
include 'other:memex'
|
||||||
include 'other:wmsa_old'
|
include 'other:wmsa_old'
|
||||||
|
|
||||||
|
15
third-party/README.md
vendored
15
third-party/README.md
vendored
@ -6,14 +6,11 @@ or lack an artifact, or to override some default that is inappropriate for the t
|
|||||||
## Sources and Licenses
|
## Sources and Licenses
|
||||||
|
|
||||||
### Modified
|
### Modified
|
||||||
* [RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3
|
* [RDRPosTagger](rdrpostagger/) - GPL3
|
||||||
* [PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3
|
* [PorterStemmer](porterstemmer/) - LGPL3
|
||||||
* [Uppend](https://github.com/upserve/uppend) - MIT
|
* [Uppend](uppend/) - MIT
|
||||||
* [OpenZIM](https://github.com/openzim/libzim) - GPL-2.0
|
* [OpenZIM](openzim/) - GPL-2.0
|
||||||
* [XZ for Java](https://tukaani.org/xz/) - Public Domain
|
* [SymSpell](symspell/) - LGPL-3.0
|
||||||
* [SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0
|
|
||||||
|
|
||||||
### Monkey Patched
|
### Monkey Patched
|
||||||
* [GSON](https://github.com/google/gson) - Apache-2.0
|
* [Apache OpenNLP](monkey-patch-opennlp/) - Apache-2.0
|
||||||
* OpenJDK - GPL-2.0 (packaged under jdkoverride)
|
|
||||||
* Stanford OpenNLP - Apache-2.0
|
|
||||||
|
@ -27,5 +27,5 @@ dependencies {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test {
|
test {
|
||||||
useJUnitPlatform()
|
// Discover and execute this module's tests through the JUnit Platform.
useJUnitPlatform()
|
||||||
}
|
}
|
11
third-party/monkey-patch-opennlp/readme.md
vendored
Normal file
11
third-party/monkey-patch-opennlp/readme.md
vendored
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# Monkey Patched OpenNLP
|
||||||
|
|
||||||
|
Apache OpenNLP - Apache-2.0
|
||||||
|
|
||||||
|
## Rationale
|
||||||
|
|
||||||
|
OpenNLP's sentence detector uses a slow StringBuffer instead of a StringBuilder where it makes no
|
||||||
|
sense to do so. This makes it much slower than it needs to be. I've found no way to file issues with the
|
||||||
|
project to get it fixed. Instead we're doing this monkey patch where the class is overridden with something
|
||||||
|
better.
|
||||||
|
|
24
third-party/openzim/build.gradle
vendored
Normal file
24
third-party/openzim/build.gradle
vendored
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(17))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
implementation libs.bundles.nlp
|
||||||
|
implementation libs.zstd
|
||||||
|
implementation libs.commons.compress
|
||||||
|
implementation libs.ffi
|
||||||
|
implementation libs.databind
|
||||||
|
implementation libs.bundles.gson
|
||||||
|
|
||||||
|
implementation project(':third-party:xz')
|
||||||
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
11
third-party/openzim/readme.md
vendored
Normal file
11
third-party/openzim/readme.md
vendored
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# OpenZIM
|
||||||
|
|
||||||
|
[OpenZIM](https://github.com/openzim/libzim) - GPL-2.0
|
||||||
|
|
||||||
|
OpenZIM is a ZIM file reader. This code has been modified in a fairly crude manner
|
||||||
|
to be much faster than the original code base which seems quite antique. It also
|
||||||
|
supports XZ compression.
|
||||||
|
|
||||||
|
**Important Note** the license is incompatible with AGPL 3, so we can't link Marginalia
|
||||||
|
directly to this. It's still very useful for building tools that deal with
|
||||||
|
wikipedia data which would be stand-alone.
|
16
third-party/porterstemmer/build.gradle
vendored
Normal file
16
third-party/porterstemmer/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(17))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
6
third-party/porterstemmer/readme.md
vendored
Normal file
6
third-party/porterstemmer/readme.md
vendored
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# Porterstemmer
|
||||||
|
|
||||||
|
[PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3
|
||||||
|
|
||||||
|
It's a [porter stemmer](https://tartarus.org/martin/PorterStemmer/) library, although one comes with OpenNLP
|
||||||
|
too. TBD which one to use, they're fairly equivalent.
|
16
third-party/rdrpostagger/build.gradle
vendored
Normal file
16
third-party/rdrpostagger/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(17))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
12
third-party/rdrpostagger/readme.md
vendored
Normal file
12
third-party/rdrpostagger/readme.md
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# RDRPosTagger
|
||||||
|
|
||||||
|
[RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3
|
||||||
|
|
||||||
|
datquocnguyen's excellent fast POS tagger. It's been crudely modified to be faster.
|
||||||
|
Unlike the original, it only does English.
|
||||||
|
|
||||||
|
## Citations
|
||||||
|
|
||||||
|
- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [RDRPOSTagger: A Ripple Down Rules-based Part-Of-Speech Tagger](http://www.aclweb.org/anthology/E14-2005). In *Proceedings of the Demonstrations at the 14th Conference of the European Chapter of the Association for Computational Linguistics*, EACL 2014, pp. 17-20, 2014. [[.PDF]](http://www.aclweb.org/anthology/E14-2005) [[.bib]](http://www.aclweb.org/anthology/E14-2005.bib)
|
||||||
|
|
||||||
|
- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [A Robust Transformation-Based Learning Approach Using Ripple Down Rules for Part-Of-Speech Tagging](http://content.iospress.com/articles/ai-communications/aic698). *AI Communications* (AICom), vol. 29, no. 3, pp. 409-422, 2016. [[.PDF]](http://arxiv.org/pdf/1412.4021.pdf) [[.bib]](http://rdrpostagger.sourceforge.net/AICom.bib)
|
File diff suppressed because it is too large
Load Diff
@ -1,559 +0,0 @@
|
|||||||
package jdkoverride;/*
|
|
||||||
* Copyright (c) 1996, 2021, Oracle and/or its affiliates. All rights reserved.
|
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
||||||
*
|
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 only, as
|
|
||||||
* published by the Free Software Foundation. Oracle designates this
|
|
||||||
* particular file as subject to the "Classpath" exception as provided
|
|
||||||
* by Oracle in the LICENSE file that accompanied this code.
|
|
||||||
*
|
|
||||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
||||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
||||||
* version 2 for more details (a copy is included in the LICENSE file that
|
|
||||||
* accompanied this code).
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License version
|
|
||||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
||||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
*
|
|
||||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
||||||
* or visit www.oracle.com if you need additional information or have any
|
|
||||||
* questions.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.io.UncheckedIOException;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Spliterator;
|
|
||||||
import java.util.Spliterators;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
import java.util.stream.StreamSupport;
|
|
||||||
|
|
||||||
/** This is java.io.BufferedReader with a different value for defaultExpectedLineLength
|
|
||||||
*/
|
|
||||||
|
|
||||||
public class LargeLineBufferedReader extends Reader {
|
|
||||||
|
|
||||||
private Reader in;
|
|
||||||
|
|
||||||
private char[] cb;
|
|
||||||
private int nChars, nextChar;
|
|
||||||
|
|
||||||
private static final int INVALIDATED = -2;
|
|
||||||
private static final int UNMARKED = -1;
|
|
||||||
private int markedChar = UNMARKED;
|
|
||||||
private int readAheadLimit = 0; /* Valid only when markedChar > 0 */
|
|
||||||
|
|
||||||
/** If the next character is a line feed, skip it */
|
|
||||||
private boolean skipLF = false;
|
|
||||||
|
|
||||||
/** The skipLF flag when the mark was set */
|
|
||||||
private boolean markedSkipLF = false;
|
|
||||||
|
|
||||||
private static int defaultCharBufferSize = 8192;
|
|
||||||
private static int defaultExpectedLineLength = 65536;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a buffering character-input stream that uses an input buffer of
|
|
||||||
* the specified size.
|
|
||||||
*
|
|
||||||
* @param in A Reader
|
|
||||||
* @param sz Input-buffer size
|
|
||||||
*
|
|
||||||
* @throws IllegalArgumentException If {@code sz <= 0}
|
|
||||||
*/
|
|
||||||
public LargeLineBufferedReader(Reader in, int sz) {
|
|
||||||
super(in);
|
|
||||||
if (sz <= 0)
|
|
||||||
throw new IllegalArgumentException("Buffer size <= 0");
|
|
||||||
this.in = in;
|
|
||||||
cb = new char[sz];
|
|
||||||
nextChar = nChars = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a buffering character-input stream that uses a default-sized
|
|
||||||
* input buffer.
|
|
||||||
*
|
|
||||||
* @param in A Reader
|
|
||||||
*/
|
|
||||||
public LargeLineBufferedReader(Reader in) {
|
|
||||||
this(in, defaultCharBufferSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Checks to make sure that the stream has not been closed */
|
|
||||||
private void ensureOpen() throws IOException {
|
|
||||||
if (in == null)
|
|
||||||
throw new IOException("Stream closed");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Fills the input buffer, taking the mark into account if it is valid.
|
|
||||||
*/
|
|
||||||
private void fill() throws IOException {
|
|
||||||
int dst;
|
|
||||||
if (markedChar <= UNMARKED) {
|
|
||||||
/* No mark */
|
|
||||||
dst = 0;
|
|
||||||
} else {
|
|
||||||
/* Marked */
|
|
||||||
int delta = nextChar - markedChar;
|
|
||||||
if (delta >= readAheadLimit) {
|
|
||||||
/* Gone past read-ahead limit: Invalidate mark */
|
|
||||||
markedChar = INVALIDATED;
|
|
||||||
readAheadLimit = 0;
|
|
||||||
dst = 0;
|
|
||||||
} else {
|
|
||||||
if (readAheadLimit <= cb.length) {
|
|
||||||
/* Shuffle in the current buffer */
|
|
||||||
System.arraycopy(cb, markedChar, cb, 0, delta);
|
|
||||||
markedChar = 0;
|
|
||||||
dst = delta;
|
|
||||||
} else {
|
|
||||||
/* Reallocate buffer to accommodate read-ahead limit */
|
|
||||||
char[] ncb = new char[readAheadLimit];
|
|
||||||
System.arraycopy(cb, markedChar, ncb, 0, delta);
|
|
||||||
cb = ncb;
|
|
||||||
markedChar = 0;
|
|
||||||
dst = delta;
|
|
||||||
}
|
|
||||||
nextChar = nChars = delta;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int n;
|
|
||||||
do {
|
|
||||||
n = in.read(cb, dst, cb.length - dst);
|
|
||||||
} while (n == 0);
|
|
||||||
if (n > 0) {
|
|
||||||
nChars = dst + n;
|
|
||||||
nextChar = dst;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a single character.
|
|
||||||
*
|
|
||||||
* @return The character read, as an integer in the range
|
|
||||||
* 0 to 65535 ({@code 0x00-0xffff}), or -1 if the
|
|
||||||
* end of the stream has been reached
|
|
||||||
* @throws IOException If an I/O error occurs
|
|
||||||
*/
|
|
||||||
public int read() throws IOException {
|
|
||||||
synchronized (lock) {
|
|
||||||
ensureOpen();
|
|
||||||
for (;;) {
|
|
||||||
if (nextChar >= nChars) {
|
|
||||||
fill();
|
|
||||||
if (nextChar >= nChars)
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (skipLF) {
|
|
||||||
skipLF = false;
|
|
||||||
if (cb[nextChar] == '\n') {
|
|
||||||
nextChar++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return cb[nextChar++];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads characters into a portion of an array, reading from the underlying
|
|
||||||
* stream if necessary.
|
|
||||||
*/
|
|
||||||
private int read1(char[] cbuf, int off, int len) throws IOException {
|
|
||||||
if (nextChar >= nChars) {
|
|
||||||
/* If the requested length is at least as large as the buffer, and
|
|
||||||
if there is no mark/reset activity, and if line feeds are not
|
|
||||||
being skipped, do not bother to copy the characters into the
|
|
||||||
local buffer. In this way buffered streams will cascade
|
|
||||||
harmlessly. */
|
|
||||||
if (len >= cb.length && markedChar <= UNMARKED && !skipLF) {
|
|
||||||
return in.read(cbuf, off, len);
|
|
||||||
}
|
|
||||||
fill();
|
|
||||||
}
|
|
||||||
if (nextChar >= nChars) return -1;
|
|
||||||
if (skipLF) {
|
|
||||||
skipLF = false;
|
|
||||||
if (cb[nextChar] == '\n') {
|
|
||||||
nextChar++;
|
|
||||||
if (nextChar >= nChars)
|
|
||||||
fill();
|
|
||||||
if (nextChar >= nChars)
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int n = Math.min(len, nChars - nextChar);
|
|
||||||
System.arraycopy(cb, nextChar, cbuf, off, n);
|
|
||||||
nextChar += n;
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads characters into a portion of an array.
|
|
||||||
*
|
|
||||||
* <p> This method implements the general contract of the corresponding
|
|
||||||
* {@link Reader#read(char[], int, int) read} method of the
|
|
||||||
* {@link Reader} class. As an additional convenience, it
|
|
||||||
* attempts to read as many characters as possible by repeatedly invoking
|
|
||||||
* the {@code read} method of the underlying stream. This iterated
|
|
||||||
* {@code read} continues until one of the following conditions becomes
|
|
||||||
* true:
|
|
||||||
* <ul>
|
|
||||||
*
|
|
||||||
* <li> The specified number of characters have been read,
|
|
||||||
*
|
|
||||||
* <li> The {@code read} method of the underlying stream returns
|
|
||||||
* {@code -1}, indicating end-of-file, or
|
|
||||||
*
|
|
||||||
* <li> The {@code ready} method of the underlying stream
|
|
||||||
* returns {@code false}, indicating that further input requests
|
|
||||||
* would block.
|
|
||||||
*
|
|
||||||
* </ul>
|
|
||||||
* If the first {@code read} on the underlying stream returns
|
|
||||||
* {@code -1} to indicate end-of-file then this method returns
|
|
||||||
* {@code -1}. Otherwise this method returns the number of characters
|
|
||||||
* actually read.
|
|
||||||
*
|
|
||||||
* <p> Subclasses of this class are encouraged, but not required, to
|
|
||||||
* attempt to read as many characters as possible in the same fashion.
|
|
||||||
*
|
|
||||||
* <p> Ordinarily this method takes characters from this stream's character
|
|
||||||
* buffer, filling it from the underlying stream as necessary. If,
|
|
||||||
* however, the buffer is empty, the mark is not valid, and the requested
|
|
||||||
* length is at least as large as the buffer, then this method will read
|
|
||||||
* characters directly from the underlying stream into the given array.
|
|
||||||
* Thus redundant {@code BufferedReader}s will not copy data
|
|
||||||
* unnecessarily.
|
|
||||||
*
|
|
||||||
* @param cbuf {@inheritDoc}
|
|
||||||
* @param off {@inheritDoc}
|
|
||||||
* @param len {@inheritDoc}
|
|
||||||
*
|
|
||||||
* @return {@inheritDoc}
|
|
||||||
*
|
|
||||||
* @throws IndexOutOfBoundsException {@inheritDoc}
|
|
||||||
* @throws IOException {@inheritDoc}
|
|
||||||
*/
|
|
||||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
|
||||||
synchronized (lock) {
|
|
||||||
ensureOpen();
|
|
||||||
Objects.checkFromIndexSize(off, len, cbuf.length);
|
|
||||||
if (len == 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int n = read1(cbuf, off, len);
|
|
||||||
if (n <= 0) return n;
|
|
||||||
while ((n < len) && in.ready()) {
|
|
||||||
int n1 = read1(cbuf, off + n, len - n);
|
|
||||||
if (n1 <= 0) break;
|
|
||||||
n += n1;
|
|
||||||
}
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a line of text. A line is considered to be terminated by any one
|
|
||||||
* of a line feed ('\n'), a carriage return ('\r'), a carriage return
|
|
||||||
* followed immediately by a line feed, or by reaching the end-of-file
|
|
||||||
* (EOF).
|
|
||||||
*
|
|
||||||
* @param ignoreLF If true, the next '\n' will be skipped
|
|
||||||
* @param term Output: Whether a line terminator was encountered
|
|
||||||
* while reading the line; may be {@code null}.
|
|
||||||
*
|
|
||||||
* @return A String containing the contents of the line, not including
|
|
||||||
* any line-termination characters, or null if the end of the
|
|
||||||
* stream has been reached without reading any characters
|
|
||||||
*
|
|
||||||
* @see java.io.LineNumberReader#readLine()
|
|
||||||
*
|
|
||||||
* @throws IOException If an I/O error occurs
|
|
||||||
*/
|
|
||||||
|
|
||||||
StringBuilder s = new StringBuilder(10000);
|
|
||||||
String readLine(boolean ignoreLF, boolean[] term) throws IOException {
|
|
||||||
|
|
||||||
int startChar;
|
|
||||||
|
|
||||||
synchronized (lock) {
|
|
||||||
|
|
||||||
s.setLength(0);
|
|
||||||
|
|
||||||
ensureOpen();
|
|
||||||
boolean omitLF = ignoreLF || skipLF;
|
|
||||||
if (term != null) term[0] = false;
|
|
||||||
|
|
||||||
for (;;) {
|
|
||||||
|
|
||||||
if (nextChar >= nChars)
|
|
||||||
fill();
|
|
||||||
if (nextChar >= nChars) { /* EOF */
|
|
||||||
if (s != null && s.length() > 0)
|
|
||||||
return s.toString();
|
|
||||||
else
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
boolean eol = false;
|
|
||||||
char c = 0;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
/* Skip a leftover '\n', if necessary */
|
|
||||||
if (omitLF && (cb[nextChar] == '\n'))
|
|
||||||
nextChar++;
|
|
||||||
skipLF = false;
|
|
||||||
omitLF = false;
|
|
||||||
|
|
||||||
for (i = nextChar; i < nChars; i++) {
|
|
||||||
c = cb[i];
|
|
||||||
if ((c == '\n') || (c == '\r')) {
|
|
||||||
if (term != null) term[0] = true;
|
|
||||||
eol = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
startChar = nextChar;
|
|
||||||
nextChar = i;
|
|
||||||
|
|
||||||
if (eol) {
|
|
||||||
s.append(cb, startChar, i - startChar);
|
|
||||||
String str = s.toString();
|
|
||||||
|
|
||||||
nextChar++;
|
|
||||||
if (c == '\r') {
|
|
||||||
skipLF = true;
|
|
||||||
}
|
|
||||||
return str;
|
|
||||||
}
|
|
||||||
|
|
||||||
s.append(cb, startChar, i - startChar);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a line of text. A line is considered to be terminated by any one
|
|
||||||
* of a line feed ('\n'), a carriage return ('\r'), a carriage return
|
|
||||||
* followed immediately by a line feed, or by reaching the end-of-file
|
|
||||||
* (EOF).
|
|
||||||
*
|
|
||||||
* @return A String containing the contents of the line, not including
|
|
||||||
* any line-termination characters, or null if the end of the
|
|
||||||
* stream has been reached without reading any characters
|
|
||||||
*
|
|
||||||
* @throws IOException If an I/O error occurs
|
|
||||||
*
|
|
||||||
* @see java.nio.file.Files#readAllLines
|
|
||||||
*/
|
|
||||||
public String readLine() throws IOException {
|
|
||||||
return readLine(false, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* {@inheritDoc}
|
|
||||||
*/
|
|
||||||
public long skip(long n) throws IOException {
|
|
||||||
if (n < 0L) {
|
|
||||||
throw new IllegalArgumentException("skip value is negative");
|
|
||||||
}
|
|
||||||
synchronized (lock) {
|
|
||||||
ensureOpen();
|
|
||||||
long r = n;
|
|
||||||
while (r > 0) {
|
|
||||||
if (nextChar >= nChars)
|
|
||||||
fill();
|
|
||||||
if (nextChar >= nChars) /* EOF */
|
|
||||||
break;
|
|
||||||
if (skipLF) {
|
|
||||||
skipLF = false;
|
|
||||||
if (cb[nextChar] == '\n') {
|
|
||||||
nextChar++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
long d = nChars - nextChar;
|
|
||||||
if (r <= d) {
|
|
||||||
nextChar += r;
|
|
||||||
r = 0;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
r -= d;
|
|
||||||
nextChar = nChars;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return n - r;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tells whether this stream is ready to be read. A buffered character
|
|
||||||
* stream is ready if the buffer is not empty, or if the underlying
|
|
||||||
* character stream is ready.
|
|
||||||
*
|
|
||||||
* @throws IOException If an I/O error occurs
|
|
||||||
*/
|
|
||||||
public boolean ready() throws IOException {
|
|
||||||
synchronized (lock) {
|
|
||||||
ensureOpen();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If newline needs to be skipped and the next char to be read
|
|
||||||
* is a newline character, then just skip it right away.
|
|
||||||
*/
|
|
||||||
if (skipLF) {
|
|
||||||
/* Note that in.ready() will return true if and only if the next
|
|
||||||
* read on the stream will not block.
|
|
||||||
*/
|
|
||||||
if (nextChar >= nChars && in.ready()) {
|
|
||||||
fill();
|
|
||||||
}
|
|
||||||
if (nextChar < nChars) {
|
|
||||||
if (cb[nextChar] == '\n')
|
|
||||||
nextChar++;
|
|
||||||
skipLF = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return (nextChar < nChars) || in.ready();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tells whether this stream supports the mark() operation, which it does.
|
|
||||||
*/
|
|
||||||
public boolean markSupported() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Marks the present position in the stream. Subsequent calls to reset()
|
|
||||||
* will attempt to reposition the stream to this point.
|
|
||||||
*
|
|
||||||
* @param readAheadLimit Limit on the number of characters that may be
|
|
||||||
* read while still preserving the mark. An attempt
|
|
||||||
* to reset the stream after reading characters
|
|
||||||
* up to this limit or beyond may fail.
|
|
||||||
* A limit value larger than the size of the input
|
|
||||||
* buffer will cause a new buffer to be allocated
|
|
||||||
* whose size is no smaller than limit.
|
|
||||||
* Therefore large values should be used with care.
|
|
||||||
*
|
|
||||||
* @throws IllegalArgumentException If {@code readAheadLimit < 0}
|
|
||||||
* @throws IOException If an I/O error occurs
|
|
||||||
*/
|
|
||||||
public void mark(int readAheadLimit) throws IOException {
|
|
||||||
if (readAheadLimit < 0) {
|
|
||||||
throw new IllegalArgumentException("Read-ahead limit < 0");
|
|
||||||
}
|
|
||||||
synchronized (lock) {
|
|
||||||
ensureOpen();
|
|
||||||
this.readAheadLimit = readAheadLimit;
|
|
||||||
markedChar = nextChar;
|
|
||||||
markedSkipLF = skipLF;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Resets the stream to the most recent mark.
|
|
||||||
*
|
|
||||||
* @throws IOException If the stream has never been marked,
|
|
||||||
* or if the mark has been invalidated
|
|
||||||
*/
|
|
||||||
public void reset() throws IOException {
|
|
||||||
synchronized (lock) {
|
|
||||||
ensureOpen();
|
|
||||||
if (markedChar < 0)
|
|
||||||
throw new IOException((markedChar == INVALIDATED)
|
|
||||||
? "Mark invalid"
|
|
||||||
: "Stream not marked");
|
|
||||||
nextChar = markedChar;
|
|
||||||
skipLF = markedSkipLF;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() throws IOException {
|
|
||||||
synchronized (lock) {
|
|
||||||
if (in == null)
|
|
||||||
return;
|
|
||||||
try {
|
|
||||||
in.close();
|
|
||||||
} finally {
|
|
||||||
in = null;
|
|
||||||
cb = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a {@code Stream}, the elements of which are lines read from
|
|
||||||
* this {@code BufferedReader}. The {@link Stream} is lazily populated,
|
|
||||||
* i.e., read only occurs during the
|
|
||||||
* <a href="../util/stream/package-summary.html#StreamOps">terminal
|
|
||||||
* stream operation</a>.
|
|
||||||
*
|
|
||||||
* <p> The reader must not be operated on during the execution of the
|
|
||||||
* terminal stream operation. Otherwise, the result of the terminal stream
|
|
||||||
* operation is undefined.
|
|
||||||
*
|
|
||||||
* <p> After execution of the terminal stream operation there are no
|
|
||||||
* guarantees that the reader will be at a specific position from which to
|
|
||||||
* read the next character or line.
|
|
||||||
*
|
|
||||||
* <p> If an {@link IOException} is thrown when accessing the underlying
|
|
||||||
* {@code BufferedReader}, it is wrapped in an {@link
|
|
||||||
* UncheckedIOException} which will be thrown from the {@code Stream}
|
|
||||||
* method that caused the read to take place. This method will return a
|
|
||||||
* Stream if invoked on a BufferedReader that is closed. Any operation on
|
|
||||||
* that stream that requires reading from the BufferedReader after it is
|
|
||||||
* closed, will cause an UncheckedIOException to be thrown.
|
|
||||||
*
|
|
||||||
* @return a {@code Stream<String>} providing the lines of text
|
|
||||||
* described by this {@code BufferedReader}
|
|
||||||
*
|
|
||||||
* @since 1.8
|
|
||||||
*/
|
|
||||||
public Stream<String> lines() {
|
|
||||||
Iterator<String> iter = new Iterator<>() {
|
|
||||||
String nextLine = null;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean hasNext() {
|
|
||||||
if (nextLine != null) {
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
try {
|
|
||||||
nextLine = readLine();
|
|
||||||
return (nextLine != null);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new UncheckedIOException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String next() {
|
|
||||||
if (nextLine != null || hasNext()) {
|
|
||||||
String line = nextLine;
|
|
||||||
nextLine = null;
|
|
||||||
return line;
|
|
||||||
} else {
|
|
||||||
throw new NoSuchElementException();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
|
|
||||||
iter, Spliterator.ORDERED | Spliterator.NONNULL), false);
|
|
||||||
}
|
|
||||||
}
|
|
16
third-party/symspell/build.gradle
vendored
Normal file
16
third-party/symspell/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(17))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
9
third-party/symspell/readme.md
vendored
Normal file
9
third-party/symspell/readme.md
vendored
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# SymSpell
|
||||||
|
|
||||||
|
[SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0
|
||||||
|
|
||||||
|
Fast spell-checking library. It does not appear to have a published build artifact, so we're packaging it ourselves.
|
||||||
|
|
||||||
|
## Further Reading
|
||||||
|
|
||||||
|
Wolf Garbe, [1000x Faster Spelling Correction algorithm (2012)](https://wolfgarbe.medium.com/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f)
|
17
third-party/uppend/build.gradle
vendored
Normal file
17
third-party/uppend/build.gradle
vendored
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(17))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
implementation libs.ffi
|
||||||
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
6
third-party/uppend/readme.md
vendored
Normal file
6
third-party/uppend/readme.md
vendored
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# Uppend
|
||||||
|
|
||||||
|
[Uppend](https://github.com/upserve/uppend) - MIT
|
||||||
|
|
||||||
|
It's "an append-only, key-multivalue store". Cool project, but we're unceremoniously pillaging just a small piece of
|
||||||
|
code they wrote for calling [madvise()](https://man7.org/linux/man-pages/man2/madvise.2.html) on off-heap byte buffers.
|
16
third-party/xz/build.gradle
vendored
Normal file
16
third-party/xz/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(17))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
9
third-party/xz/readme.md
vendored
Normal file
9
third-party/xz/readme.md
vendored
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# XZ
|
||||||
|
|
||||||
|
[XZ for Java](https://tukaani.org/xz/) - Public Domain
|
||||||
|
|
||||||
|
"XZ Utils is free general-purpose data compression software with a high compression ratio.
|
||||||
|
XZ Utils were written for POSIX-like systems, but also work on some not-so-POSIX systems.
|
||||||
|
XZ Utils are the successor to LZMA Utils."
|
||||||
|
|
||||||
|
Needed for [openzim](../openzim) to deal with modern zim files.
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user