The refactoring will continue until morale improves.
This commit is contained in:
parent
4cec89da91
commit
616effdb3c
@ -11,7 +11,6 @@ java {
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -12,7 +12,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -12,7 +12,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -12,7 +12,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
||||
|
||||
|
@ -12,7 +12,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:guarded-regex')
|
||||
|
@ -11,7 +11,6 @@ java {
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:api:index-api')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -19,7 +19,7 @@ application {
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':third-party:porterstemmer')
|
||||
implementation project(':code:api:index-api')
|
||||
|
||||
implementation project(':code:common:model')
|
||||
|
@ -13,7 +13,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:api:index-api')
|
||||
|
@ -2,13 +2,13 @@ package nu.marginalia.crawling.io;
|
||||
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import jdkoverride.LargeLineBufferedReader;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
@ -29,7 +29,7 @@ public class CrawledDomainReader {
|
||||
public CrawledDomain read(Path path) throws IOException {
|
||||
DomainDataAssembler domainData = new DomainDataAssembler();
|
||||
|
||||
try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (line.startsWith("//")) {
|
||||
|
@ -19,7 +19,6 @@ application {
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
@ -12,7 +12,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
@ -18,7 +18,6 @@ application {
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:api:index-api')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
|
@ -15,7 +15,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
|
@ -11,7 +11,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
|
@ -16,7 +16,7 @@ java {
|
||||
dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':third-party')
|
||||
implementation project(':third-party:porterstemmer')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
|
@ -18,7 +18,8 @@ dependencies {
|
||||
implementation project(':code:index:index-journal')
|
||||
implementation project(':code:index:lexicon')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':third-party')
|
||||
|
||||
implementation project(':third-party:uppend')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
|
@ -14,7 +14,6 @@ dependencies {
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:index:lexicon')
|
||||
implementation project(':third-party')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
|
@ -9,7 +9,7 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':third-party:uppend')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
|
@ -9,7 +9,6 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:next-prime')
|
||||
|
||||
|
@ -15,7 +15,9 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':third-party:rdrpostagger')
|
||||
implementation project(':third-party:porterstemmer')
|
||||
implementation project(':third-party:monkey-patch-opennlp')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
|
@ -22,7 +22,7 @@ java {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':third-party:symspell')
|
||||
implementation project(':code:api:assistant-api')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
@ -1,45 +0,0 @@
|
||||
package nu.marginalia.assistant.dict;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
import org.openzim.ZIMTypes.ZIMReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
class WikiCleanerTest {
|
||||
|
||||
@Test
|
||||
void cleanWikiJunk() throws IOException {
|
||||
// String str = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Scamander", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.wiki.html"))));
|
||||
// String str2 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Plato", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.wiki.html"))));
|
||||
// String str3 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/C++", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.wiki.html"))));
|
||||
// String str4 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Memex", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.wiki.html"))));
|
||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.out.html"), str);
|
||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.out.html"), str2);
|
||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.out.html"), str3);
|
||||
// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.out.html"), str4);
|
||||
}
|
||||
|
||||
@Test @Disabled
|
||||
public void readZim() throws IOException {
|
||||
var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
|
||||
// try (var pw = new PrintWriter(new File("/home/vlofgren/Work/article-clusters.tsv"))) {
|
||||
// zr.enumerateArticles(pw);
|
||||
// }
|
||||
zr.forEachArticles((url, art) -> {
|
||||
if (art != null) {
|
||||
System.out.println(url);
|
||||
}
|
||||
// if (art != null && art.length() > 5) {
|
||||
// System.out.println(url + " -> " + art.substring(0, 5));
|
||||
// }
|
||||
}, (p) -> true);
|
||||
|
||||
/*try (var baos = zr.getArticleData("Giraffe", 'A')) {
|
||||
String str = baos.toString();
|
||||
Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.wiki.html"), str);
|
||||
Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.out.html"), new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Giraffe", str));
|
||||
}*/
|
||||
}
|
||||
}
|
@ -21,7 +21,6 @@ java {
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
|
@ -21,7 +21,6 @@ java {
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:config')
|
||||
|
@ -22,7 +22,6 @@ tasks.distZip.enabled = false
|
||||
apply from: "$rootProject.projectDir/docker-service.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:config')
|
||||
|
@ -21,7 +21,6 @@ java {
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -21,7 +21,6 @@ java {
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -4,9 +4,11 @@ ext {
|
||||
serviceToolOpts='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5000'
|
||||
}
|
||||
|
||||
docker {
|
||||
var df = new File(buildDir, "Dockerfile")
|
||||
tasks.register('dockerFile') {
|
||||
buildDir.mkdir()
|
||||
|
||||
var df = new File(buildDir, "Dockerfile")
|
||||
doLast {
|
||||
df.text = """#
|
||||
# I'm auto-generated, please don't make changes to me or commit me to git
|
||||
#
|
||||
@ -22,11 +24,23 @@ ENV JAVA_OPTS="${serviceJvmOpts} "
|
||||
|
||||
ENTRYPOINT WMSA_HOME=/wmsa /${application.applicationName}/bin/${application.applicationName} \${arg0} \${arg1}
|
||||
"""
|
||||
}
|
||||
it.outputs.file(df)
|
||||
}
|
||||
|
||||
dockerfile = new File(buildDir, "Dockerfile")
|
||||
dockerPrepare {
|
||||
dependsOn tasks.dockerFile
|
||||
}
|
||||
|
||||
dockerfileZip {
|
||||
dependsOn tasks.dockerFile
|
||||
}
|
||||
|
||||
|
||||
docker {
|
||||
dockerfile = tasks.dockerFile.outputs.files.singleFile
|
||||
name = 'marginalia.nu/'+application.applicationName+':latest'
|
||||
files tasks.distTar.outputs
|
||||
tags 'latest'
|
||||
|
||||
dependsOn tasks.distTar
|
||||
}
|
||||
|
@ -59,7 +59,6 @@ jmhJar {
|
||||
zip64 true
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service-discovery')
|
||||
|
@ -30,7 +30,6 @@ java {
|
||||
}
|
||||
}
|
||||
dependencies {
|
||||
implementation project(':third-party')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:service-discovery')
|
||||
implementation project(':code:common:service-client')
|
||||
|
@ -52,7 +52,14 @@ include 'code:crawl:loading-process'
|
||||
include 'code:crawl:common'
|
||||
include 'code:crawl:experimental'
|
||||
|
||||
include 'third-party'
|
||||
include 'third-party:porterstemmer'
|
||||
include 'third-party:xz'
|
||||
include 'third-party:symspell'
|
||||
include 'third-party:rdrpostagger'
|
||||
include 'third-party:uppend'
|
||||
include 'third-party:openzim'
|
||||
include 'third-party:monkey-patch-opennlp'
|
||||
|
||||
include 'other:memex'
|
||||
include 'other:wmsa_old'
|
||||
|
||||
|
15
third-party/README.md
vendored
15
third-party/README.md
vendored
@ -6,14 +6,11 @@ or lack an artifact, or to override some default that is inappropriate for the t
|
||||
## Sources and Licenses
|
||||
|
||||
### Modified
|
||||
* [RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3
|
||||
* [PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3
|
||||
* [Uppend](https://github.com/upserve/uppend) - MIT
|
||||
* [OpenZIM](https://github.com/openzim/libzim) - GPL-2.0
|
||||
* [XZ for Java](https://tukaani.org/xz/) - Public Domain
|
||||
* [SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0
|
||||
* [RDRPosTagger](rdrpostagger/) - GPL3
|
||||
* [PorterStemmer](porterstemmer/) - LGPL3
|
||||
* [Uppend](uppend/) - MIT
|
||||
* [OpenZIM](openzim/) - GPL-2.0
|
||||
* [SymSpell](symspell/) - LGPL-3.0
|
||||
|
||||
### Monkey Patched
|
||||
* [GSON](https://github.com/google/gson) - Apache-2.0
|
||||
* OpenJDK - GPL-2.0 (packaged under jdkoverride)
|
||||
* Stanford OpenNLP - Apache-2.0
|
||||
* [Apache OpenNLP](monkey-patch-opennlp/) - Apache-2.0
|
||||
|
@ -27,5 +27,5 @@ dependencies {
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
useJUnitPlatform()\
|
||||
}
|
11
third-party/monkey-patch-opennlp/readme.md
vendored
Normal file
11
third-party/monkey-patch-opennlp/readme.md
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
# Monkey Patched OpenNLP
|
||||
|
||||
Apache OpenNLP - Apache-2.0
|
||||
|
||||
## Rationale
|
||||
|
||||
OpenNLP's sentence detector uses a slow StringBuffer instead of a StringBuilder where it makes
|
||||
no sense to do so. This makes it much slower than it needs to be. I've found no way to file issues with the
|
||||
project to get it fixed. Instead we're doing this monkey patch where the class is overridden with something
|
||||
better.
|
||||
|
24
third-party/openzim/build.gradle
vendored
Normal file
24
third-party/openzim/build.gradle
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(17))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.nlp
|
||||
implementation libs.zstd
|
||||
implementation libs.commons.compress
|
||||
implementation libs.ffi
|
||||
implementation libs.databind
|
||||
implementation libs.bundles.gson
|
||||
|
||||
implementation project(':third-party:xz')
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
11
third-party/openzim/readme.md
vendored
Normal file
11
third-party/openzim/readme.md
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
# OpenZIM
|
||||
|
||||
[OpenZIM](https://github.com/openzim/libzim) - GPL-2.0
|
||||
|
||||
OpenZIM is a ZIM file reader. This code has been modified in a fairly crude manner
|
||||
to be much faster than the original code base which seems quite antique. It also
|
||||
supports XZ compression.
|
||||
|
||||
**Important note:** the license is incompatible with AGPL 3, so we can't link Marginalia
|
||||
directly to this. It's still very useful for building tools that deal with
|
||||
Wikipedia data, as long as those tools are stand-alone.
|
16
third-party/porterstemmer/build.gradle
vendored
Normal file
16
third-party/porterstemmer/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(17))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
6
third-party/porterstemmer/readme.md
vendored
Normal file
6
third-party/porterstemmer/readme.md
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
# Porterstemmer
|
||||
|
||||
[PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3
|
||||
|
||||
It's a [Porter stemmer](https://tartarus.org/martin/PorterStemmer/) library, although one comes with OpenNLP
|
||||
too. TBD which one to use; they're fairly equivalent.
|
16
third-party/rdrpostagger/build.gradle
vendored
Normal file
16
third-party/rdrpostagger/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(17))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
12
third-party/rdrpostagger/readme.md
vendored
Normal file
12
third-party/rdrpostagger/readme.md
vendored
Normal file
@ -0,0 +1,12 @@
|
||||
# RDRPosTagger
|
||||
|
||||
[RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3
|
||||
|
||||
datquocnguyen's excellent fast POS tagger. It's been crudely modified to be faster.
|
||||
Unlike the original, it only does English.
|
||||
|
||||
## Citations
|
||||
|
||||
- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [RDRPOSTagger: A Ripple Down Rules-based Part-Of-Speech Tagger](http://www.aclweb.org/anthology/E14-2005). In *Proceedings of the Demonstrations at the 14th Conference of the European Chapter of the Association for Computational Linguistics*, EACL 2014, pp. 17-20, 2014. [[.PDF]](http://www.aclweb.org/anthology/E14-2005) [[.bib]](http://www.aclweb.org/anthology/E14-2005.bib)
|
||||
|
||||
- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [A Robust Transformation-Based Learning Approach Using Ripple Down Rules for Part-Of-Speech Tagging](http://content.iospress.com/articles/ai-communications/aic698). *AI Communications* (AICom), vol. 29, no. 3, pp. 409-422, 2016. [[.PDF]](http://arxiv.org/pdf/1412.4021.pdf) [[.bib]](http://rdrpostagger.sourceforge.net/AICom.bib)
|
File diff suppressed because it is too large
Load Diff
@ -1,559 +0,0 @@
|
||||
package jdkoverride;/*
|
||||
* Copyright (c) 1996, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Objects;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
/** This is java.io.BufferedReader with a different value for defaultExpectedLineLength
|
||||
*/
|
||||
|
||||
public class LargeLineBufferedReader extends Reader {
|
||||
|
||||
private Reader in;
|
||||
|
||||
private char[] cb;
|
||||
private int nChars, nextChar;
|
||||
|
||||
private static final int INVALIDATED = -2;
|
||||
private static final int UNMARKED = -1;
|
||||
private int markedChar = UNMARKED;
|
||||
private int readAheadLimit = 0; /* Valid only when markedChar > 0 */
|
||||
|
||||
/** If the next character is a line feed, skip it */
|
||||
private boolean skipLF = false;
|
||||
|
||||
/** The skipLF flag when the mark was set */
|
||||
private boolean markedSkipLF = false;
|
||||
|
||||
private static int defaultCharBufferSize = 8192;
|
||||
private static int defaultExpectedLineLength = 65536;
|
||||
|
||||
/**
|
||||
* Creates a buffering character-input stream that uses an input buffer of
|
||||
* the specified size.
|
||||
*
|
||||
* @param in A Reader
|
||||
* @param sz Input-buffer size
|
||||
*
|
||||
* @throws IllegalArgumentException If {@code sz <= 0}
|
||||
*/
|
||||
public LargeLineBufferedReader(Reader in, int sz) {
|
||||
super(in);
|
||||
if (sz <= 0)
|
||||
throw new IllegalArgumentException("Buffer size <= 0");
|
||||
this.in = in;
|
||||
cb = new char[sz];
|
||||
nextChar = nChars = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a buffering character-input stream that uses a default-sized
|
||||
* input buffer.
|
||||
*
|
||||
* @param in A Reader
|
||||
*/
|
||||
public LargeLineBufferedReader(Reader in) {
|
||||
this(in, defaultCharBufferSize);
|
||||
}
|
||||
|
||||
/** Checks to make sure that the stream has not been closed */
|
||||
private void ensureOpen() throws IOException {
|
||||
if (in == null)
|
||||
throw new IOException("Stream closed");
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the input buffer, taking the mark into account if it is valid.
|
||||
*/
|
||||
private void fill() throws IOException {
|
||||
int dst;
|
||||
if (markedChar <= UNMARKED) {
|
||||
/* No mark */
|
||||
dst = 0;
|
||||
} else {
|
||||
/* Marked */
|
||||
int delta = nextChar - markedChar;
|
||||
if (delta >= readAheadLimit) {
|
||||
/* Gone past read-ahead limit: Invalidate mark */
|
||||
markedChar = INVALIDATED;
|
||||
readAheadLimit = 0;
|
||||
dst = 0;
|
||||
} else {
|
||||
if (readAheadLimit <= cb.length) {
|
||||
/* Shuffle in the current buffer */
|
||||
System.arraycopy(cb, markedChar, cb, 0, delta);
|
||||
markedChar = 0;
|
||||
dst = delta;
|
||||
} else {
|
||||
/* Reallocate buffer to accommodate read-ahead limit */
|
||||
char[] ncb = new char[readAheadLimit];
|
||||
System.arraycopy(cb, markedChar, ncb, 0, delta);
|
||||
cb = ncb;
|
||||
markedChar = 0;
|
||||
dst = delta;
|
||||
}
|
||||
nextChar = nChars = delta;
|
||||
}
|
||||
}
|
||||
|
||||
int n;
|
||||
do {
|
||||
n = in.read(cb, dst, cb.length - dst);
|
||||
} while (n == 0);
|
||||
if (n > 0) {
|
||||
nChars = dst + n;
|
||||
nextChar = dst;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a single character.
|
||||
*
|
||||
* @return The character read, as an integer in the range
|
||||
* 0 to 65535 ({@code 0x00-0xffff}), or -1 if the
|
||||
* end of the stream has been reached
|
||||
* @throws IOException If an I/O error occurs
|
||||
*/
|
||||
public int read() throws IOException {
|
||||
synchronized (lock) {
|
||||
ensureOpen();
|
||||
for (;;) {
|
||||
if (nextChar >= nChars) {
|
||||
fill();
|
||||
if (nextChar >= nChars)
|
||||
return -1;
|
||||
}
|
||||
if (skipLF) {
|
||||
skipLF = false;
|
||||
if (cb[nextChar] == '\n') {
|
||||
nextChar++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return cb[nextChar++];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads characters into a portion of an array, reading from the underlying
|
||||
* stream if necessary.
|
||||
*/
|
||||
private int read1(char[] cbuf, int off, int len) throws IOException {
|
||||
if (nextChar >= nChars) {
|
||||
/* If the requested length is at least as large as the buffer, and
|
||||
if there is no mark/reset activity, and if line feeds are not
|
||||
being skipped, do not bother to copy the characters into the
|
||||
local buffer. In this way buffered streams will cascade
|
||||
harmlessly. */
|
||||
if (len >= cb.length && markedChar <= UNMARKED && !skipLF) {
|
||||
return in.read(cbuf, off, len);
|
||||
}
|
||||
fill();
|
||||
}
|
||||
if (nextChar >= nChars) return -1;
|
||||
if (skipLF) {
|
||||
skipLF = false;
|
||||
if (cb[nextChar] == '\n') {
|
||||
nextChar++;
|
||||
if (nextChar >= nChars)
|
||||
fill();
|
||||
if (nextChar >= nChars)
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
int n = Math.min(len, nChars - nextChar);
|
||||
System.arraycopy(cb, nextChar, cbuf, off, n);
|
||||
nextChar += n;
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads characters into a portion of an array.
|
||||
*
|
||||
* <p> This method implements the general contract of the corresponding
|
||||
* {@link Reader#read(char[], int, int) read} method of the
|
||||
* {@link Reader} class. As an additional convenience, it
|
||||
* attempts to read as many characters as possible by repeatedly invoking
|
||||
* the {@code read} method of the underlying stream. This iterated
|
||||
* {@code read} continues until one of the following conditions becomes
|
||||
* true:
|
||||
* <ul>
|
||||
*
|
||||
* <li> The specified number of characters have been read,
|
||||
*
|
||||
* <li> The {@code read} method of the underlying stream returns
|
||||
* {@code -1}, indicating end-of-file, or
|
||||
*
|
||||
* <li> The {@code ready} method of the underlying stream
|
||||
* returns {@code false}, indicating that further input requests
|
||||
* would block.
|
||||
*
|
||||
* </ul>
|
||||
* If the first {@code read} on the underlying stream returns
|
||||
* {@code -1} to indicate end-of-file then this method returns
|
||||
* {@code -1}. Otherwise this method returns the number of characters
|
||||
* actually read.
|
||||
*
|
||||
* <p> Subclasses of this class are encouraged, but not required, to
|
||||
* attempt to read as many characters as possible in the same fashion.
|
||||
*
|
||||
* <p> Ordinarily this method takes characters from this stream's character
|
||||
* buffer, filling it from the underlying stream as necessary. If,
|
||||
* however, the buffer is empty, the mark is not valid, and the requested
|
||||
* length is at least as large as the buffer, then this method will read
|
||||
* characters directly from the underlying stream into the given array.
|
||||
* Thus redundant {@code BufferedReader}s will not copy data
|
||||
* unnecessarily.
|
||||
*
|
||||
* @param cbuf {@inheritDoc}
|
||||
* @param off {@inheritDoc}
|
||||
* @param len {@inheritDoc}
|
||||
*
|
||||
* @return {@inheritDoc}
|
||||
*
|
||||
* @throws IndexOutOfBoundsException {@inheritDoc}
|
||||
* @throws IOException {@inheritDoc}
|
||||
*/
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
synchronized (lock) {
|
||||
ensureOpen();
|
||||
Objects.checkFromIndexSize(off, len, cbuf.length);
|
||||
if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int n = read1(cbuf, off, len);
|
||||
if (n <= 0) return n;
|
||||
while ((n < len) && in.ready()) {
|
||||
int n1 = read1(cbuf, off + n, len - n);
|
||||
if (n1 <= 0) break;
|
||||
n += n1;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a line of text. A line is considered to be terminated by any one
|
||||
* of a line feed ('\n'), a carriage return ('\r'), a carriage return
|
||||
* followed immediately by a line feed, or by reaching the end-of-file
|
||||
* (EOF).
|
||||
*
|
||||
* @param ignoreLF If true, the next '\n' will be skipped
|
||||
* @param term Output: Whether a line terminator was encountered
|
||||
* while reading the line; may be {@code null}.
|
||||
*
|
||||
* @return A String containing the contents of the line, not including
|
||||
* any line-termination characters, or null if the end of the
|
||||
* stream has been reached without reading any characters
|
||||
*
|
||||
* @see java.io.LineNumberReader#readLine()
|
||||
*
|
||||
* @throws IOException If an I/O error occurs
|
||||
*/
|
||||
|
||||
StringBuilder s = new StringBuilder(10000);
|
||||
String readLine(boolean ignoreLF, boolean[] term) throws IOException {
|
||||
|
||||
int startChar;
|
||||
|
||||
synchronized (lock) {
|
||||
|
||||
s.setLength(0);
|
||||
|
||||
ensureOpen();
|
||||
boolean omitLF = ignoreLF || skipLF;
|
||||
if (term != null) term[0] = false;
|
||||
|
||||
for (;;) {
|
||||
|
||||
if (nextChar >= nChars)
|
||||
fill();
|
||||
if (nextChar >= nChars) { /* EOF */
|
||||
if (s != null && s.length() > 0)
|
||||
return s.toString();
|
||||
else
|
||||
return null;
|
||||
}
|
||||
boolean eol = false;
|
||||
char c = 0;
|
||||
int i;
|
||||
|
||||
/* Skip a leftover '\n', if necessary */
|
||||
if (omitLF && (cb[nextChar] == '\n'))
|
||||
nextChar++;
|
||||
skipLF = false;
|
||||
omitLF = false;
|
||||
|
||||
for (i = nextChar; i < nChars; i++) {
|
||||
c = cb[i];
|
||||
if ((c == '\n') || (c == '\r')) {
|
||||
if (term != null) term[0] = true;
|
||||
eol = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
startChar = nextChar;
|
||||
nextChar = i;
|
||||
|
||||
if (eol) {
|
||||
s.append(cb, startChar, i - startChar);
|
||||
String str = s.toString();
|
||||
|
||||
nextChar++;
|
||||
if (c == '\r') {
|
||||
skipLF = true;
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
s.append(cb, startChar, i - startChar);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a line of text. A line is considered to be terminated by any one
|
||||
* of a line feed ('\n'), a carriage return ('\r'), a carriage return
|
||||
* followed immediately by a line feed, or by reaching the end-of-file
|
||||
* (EOF).
|
||||
*
|
||||
* @return A String containing the contents of the line, not including
|
||||
* any line-termination characters, or null if the end of the
|
||||
* stream has been reached without reading any characters
|
||||
*
|
||||
* @throws IOException If an I/O error occurs
|
||||
*
|
||||
* @see java.nio.file.Files#readAllLines
|
||||
*/
|
||||
public String readLine() throws IOException {
|
||||
return readLine(false, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
public long skip(long n) throws IOException {
|
||||
if (n < 0L) {
|
||||
throw new IllegalArgumentException("skip value is negative");
|
||||
}
|
||||
synchronized (lock) {
|
||||
ensureOpen();
|
||||
long r = n;
|
||||
while (r > 0) {
|
||||
if (nextChar >= nChars)
|
||||
fill();
|
||||
if (nextChar >= nChars) /* EOF */
|
||||
break;
|
||||
if (skipLF) {
|
||||
skipLF = false;
|
||||
if (cb[nextChar] == '\n') {
|
||||
nextChar++;
|
||||
}
|
||||
}
|
||||
long d = nChars - nextChar;
|
||||
if (r <= d) {
|
||||
nextChar += r;
|
||||
r = 0;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
r -= d;
|
||||
nextChar = nChars;
|
||||
}
|
||||
}
|
||||
return n - r;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells whether this stream is ready to be read. A buffered character
|
||||
* stream is ready if the buffer is not empty, or if the underlying
|
||||
* character stream is ready.
|
||||
*
|
||||
* @throws IOException If an I/O error occurs
|
||||
*/
|
||||
public boolean ready() throws IOException {
|
||||
synchronized (lock) {
|
||||
ensureOpen();
|
||||
|
||||
/*
|
||||
* If newline needs to be skipped and the next char to be read
|
||||
* is a newline character, then just skip it right away.
|
||||
*/
|
||||
if (skipLF) {
|
||||
/* Note that in.ready() will return true if and only if the next
|
||||
* read on the stream will not block.
|
||||
*/
|
||||
if (nextChar >= nChars && in.ready()) {
|
||||
fill();
|
||||
}
|
||||
if (nextChar < nChars) {
|
||||
if (cb[nextChar] == '\n')
|
||||
nextChar++;
|
||||
skipLF = false;
|
||||
}
|
||||
}
|
||||
return (nextChar < nChars) || in.ready();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells whether this stream supports the mark() operation, which it does.
|
||||
*/
|
||||
public boolean markSupported() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Marks the present position in the stream. Subsequent calls to reset()
|
||||
* will attempt to reposition the stream to this point.
|
||||
*
|
||||
* @param readAheadLimit Limit on the number of characters that may be
|
||||
* read while still preserving the mark. An attempt
|
||||
* to reset the stream after reading characters
|
||||
* up to this limit or beyond may fail.
|
||||
* A limit value larger than the size of the input
|
||||
* buffer will cause a new buffer to be allocated
|
||||
* whose size is no smaller than limit.
|
||||
* Therefore large values should be used with care.
|
||||
*
|
||||
* @throws IllegalArgumentException If {@code readAheadLimit < 0}
|
||||
* @throws IOException If an I/O error occurs
|
||||
*/
|
||||
public void mark(int readAheadLimit) throws IOException {
|
||||
if (readAheadLimit < 0) {
|
||||
throw new IllegalArgumentException("Read-ahead limit < 0");
|
||||
}
|
||||
synchronized (lock) {
|
||||
ensureOpen();
|
||||
this.readAheadLimit = readAheadLimit;
|
||||
markedChar = nextChar;
|
||||
markedSkipLF = skipLF;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the stream to the most recent mark.
|
||||
*
|
||||
* @throws IOException If the stream has never been marked,
|
||||
* or if the mark has been invalidated
|
||||
*/
|
||||
public void reset() throws IOException {
|
||||
synchronized (lock) {
|
||||
ensureOpen();
|
||||
if (markedChar < 0)
|
||||
throw new IOException((markedChar == INVALIDATED)
|
||||
? "Mark invalid"
|
||||
: "Stream not marked");
|
||||
nextChar = markedChar;
|
||||
skipLF = markedSkipLF;
|
||||
}
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
synchronized (lock) {
|
||||
if (in == null)
|
||||
return;
|
||||
try {
|
||||
in.close();
|
||||
} finally {
|
||||
in = null;
|
||||
cb = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@code Stream}, the elements of which are lines read from
|
||||
* this {@code BufferedReader}. The {@link Stream} is lazily populated,
|
||||
* i.e., read only occurs during the
|
||||
* <a href="../util/stream/package-summary.html#StreamOps">terminal
|
||||
* stream operation</a>.
|
||||
*
|
||||
* <p> The reader must not be operated on during the execution of the
|
||||
* terminal stream operation. Otherwise, the result of the terminal stream
|
||||
* operation is undefined.
|
||||
*
|
||||
* <p> After execution of the terminal stream operation there are no
|
||||
* guarantees that the reader will be at a specific position from which to
|
||||
* read the next character or line.
|
||||
*
|
||||
* <p> If an {@link IOException} is thrown when accessing the underlying
|
||||
* {@code BufferedReader}, it is wrapped in an {@link
|
||||
* UncheckedIOException} which will be thrown from the {@code Stream}
|
||||
* method that caused the read to take place. This method will return a
|
||||
* Stream if invoked on a BufferedReader that is closed. Any operation on
|
||||
* that stream that requires reading from the BufferedReader after it is
|
||||
* closed, will cause an UncheckedIOException to be thrown.
|
||||
*
|
||||
* @return a {@code Stream<String>} providing the lines of text
|
||||
* described by this {@code BufferedReader}
|
||||
*
|
||||
* @since 1.8
|
||||
*/
|
||||
public Stream<String> lines() {
|
||||
Iterator<String> iter = new Iterator<>() {
|
||||
String nextLine = null;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (nextLine != null) {
|
||||
return true;
|
||||
} else {
|
||||
try {
|
||||
nextLine = readLine();
|
||||
return (nextLine != null);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
if (nextLine != null || hasNext()) {
|
||||
String line = nextLine;
|
||||
nextLine = null;
|
||||
return line;
|
||||
} else {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
};
|
||||
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
|
||||
iter, Spliterator.ORDERED | Spliterator.NONNULL), false);
|
||||
}
|
||||
}
|
16
third-party/symspell/build.gradle
vendored
Normal file
16
third-party/symspell/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
// Build script for the vendored SymSpell sources: a plain Java module.
plugins {
    id 'java'
}

// Compile with a Java 17 toolchain.
java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

// No dependencies are declared for this module.
dependencies {
}

// Run tests on the JUnit Platform.
test {
    useJUnitPlatform()
}
|
9
third-party/symspell/readme.md
vendored
Normal file
9
third-party/symspell/readme.md
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
# SymSpell
|
||||
|
||||
[SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0
|
||||
|
||||
Fast spell-checking library. It does not appear to have a published artifact, so we're packaging it ourselves.
|
||||
|
||||
## Further Reading
|
||||
|
||||
Wolf Garbe, [1000x Faster Spelling Correction algorithm (2012)](https://wolfgarbe.medium.com/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f)
|
17
third-party/uppend/build.gradle
vendored
Normal file
17
third-party/uppend/build.gradle
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
// Build script for the code vendored from Uppend: a plain Java module.
plugins {
    id 'java'
}

// Compile with a Java 17 toolchain.
java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

// The only dependency is the version-catalog entry `libs.ffi`.
dependencies {
    implementation libs.ffi
}

// Run tests on the JUnit Platform.
test {
    useJUnitPlatform()
}
|
6
third-party/uppend/readme.md
vendored
Normal file
6
third-party/uppend/readme.md
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
# Uppend
|
||||
|
||||
[Uppend](https://github.com/upserve/uppend) - MIT
|
||||
|
||||
It's "an append-only, key-multivalue store". Cool project, but we're unceremoniously pillaging just a small piece of
|
||||
code they wrote for calling [madvise()](https://man7.org/linux/man-pages/man2/madvise.2.html) on off-heap byte buffers.
|
16
third-party/xz/build.gradle
vendored
Normal file
16
third-party/xz/build.gradle
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
// Build script for the vendored XZ for Java sources: a plain Java module.
plugins {
    id 'java'
}

// Compile with a Java 17 toolchain.
java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

// No dependencies are declared for this module.
dependencies {
}

// Run tests on the JUnit Platform.
test {
    useJUnitPlatform()
}
|
9
third-party/xz/readme.md
vendored
Normal file
9
third-party/xz/readme.md
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
# XZ
|
||||
|
||||
[XZ for Java](https://tukaani.org/xz/) - Public Domain
|
||||
|
||||
"XZ Utils is free general-purpose data compression software with a high compression ratio.
|
||||
XZ Utils were written for POSIX-like systems, but also work on some not-so-POSIX systems.
|
||||
XZ Utils are the successor to LZMA Utils."
|
||||
|
||||
Needed for [openzim](../openzim) to deal with modern zim files.
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user