The refactoring will continue until morale improves.

This commit is contained in:
Viktor Lofgren 2023-03-12 10:04:48 +01:00
parent 4cec89da91
commit 616effdb3c
118 changed files with 214 additions and 2287 deletions

View File

@ -11,7 +11,6 @@ java {
}
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service-discovery')

View File

@ -12,7 +12,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service-discovery')

View File

@ -12,7 +12,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service-discovery')

View File

@ -12,7 +12,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:service-discovery')

View File

@ -12,7 +12,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:guarded-regex')

View File

@ -11,7 +11,6 @@ java {
}
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:api:index-api')
implementation project(':code:common:service-discovery')

View File

@ -19,7 +19,7 @@ application {
tasks.distZip.enabled = false
dependencies {
implementation project(':third-party')
implementation project(':third-party:porterstemmer')
implementation project(':code:api:index-api')
implementation project(':code:common:model')

View File

@ -13,7 +13,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:libraries:big-string')
implementation project(':code:api:index-api')

View File

@ -2,13 +2,13 @@ package nu.marginalia.crawling.io;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import jdkoverride.LargeLineBufferedReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
@ -29,7 +29,7 @@ public class CrawledDomainReader {
public CrawledDomain read(Path path) throws IOException {
DomainDataAssembler domainData = new DomainDataAssembler();
try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("//")) {

View File

@ -19,7 +19,6 @@ application {
tasks.distZip.enabled = false
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')

View File

@ -12,7 +12,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')

View File

@ -18,7 +18,6 @@ application {
tasks.distZip.enabled = false
dependencies {
implementation project(':third-party')
implementation project(':code:api:index-api')
implementation project(':code:common:model')
implementation project(':code:common:config')

View File

@ -15,7 +15,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:service')

View File

@ -11,7 +11,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:service')

View File

@ -16,7 +16,7 @@ java {
dependencies {
implementation project(':code:common:config')
implementation project(':code:libraries:language-processing')
implementation project(':third-party')
implementation project(':third-party:porterstemmer')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -18,7 +18,8 @@ dependencies {
implementation project(':code:index:index-journal')
implementation project(':code:index:lexicon')
implementation project(':code:common:model')
implementation project(':third-party')
implementation project(':third-party:uppend')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -14,7 +14,6 @@ dependencies {
implementation project(':code:libraries:array')
implementation project(':code:common:model')
implementation project(':code:index:lexicon')
implementation project(':third-party')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -9,7 +9,7 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':third-party:uppend')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -9,7 +9,6 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':code:libraries:array')
implementation project(':code:libraries:next-prime')

View File

@ -15,7 +15,9 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:monkey-patch-opennlp')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:easy-lsh')

View File

@ -22,7 +22,7 @@ java {
}
dependencies {
implementation project(':third-party')
implementation project(':third-party:symspell')
implementation project(':code:api:assistant-api')
implementation project(':code:common:config')
implementation project(':code:common:service')

View File

@ -1,45 +0,0 @@
package nu.marginalia.assistant.dict;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import java.io.IOException;
/**
 * Manual, developer-machine-only experiments around wiki clean-up and ZIM reading.
 *
 * <p>Both methods refer to absolute paths under a local home directory, so neither
 * is a self-contained automated test.
 */
class WikiCleanerTest {

    /**
     * Placeholder: the body is entirely commented out, so this test does nothing.
     * The commented statements show how sample pages were once run through
     * {@code WikiCleaner.cleanWikiJunk} and written back to disk.
     */
    @Test
    void cleanWikiJunk() throws IOException {
        // String str = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Scamander", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.wiki.html"))));
        // String str2 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Plato", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.wiki.html"))));
        // String str3 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/C++", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.wiki.html"))));
        // String str4 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Memex", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.wiki.html"))));
        // Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.out.html"), str);
        // Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.out.html"), str2);
        // Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.out.html"), str3);
        // Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.out.html"), str4);
    }

    /**
     * Disabled: opens a local ZIM dump and prints the URL of every entry whose
     * article payload is non-null.
     */
    @Disabled
    @Test
    public void readZim() throws IOException {
        ZIMFile zimFile = new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim");
        ZIMReader reader = new ZIMReader(zimFile);

        // try (var pw = new PrintWriter(new File("/home/vlofgren/Work/article-clusters.tsv"))) {
        //     zr.enumerateArticles(pw);
        // }

        reader.forEachArticles((articleUrl, articleBody) -> {
            if (articleBody == null) {
                return;
            }
            System.out.println(articleUrl);

            // if (art != null && art.length() > 5) {
            //     System.out.println(url + " -> " + art.substring(0, 5));
            // }
        }, candidate -> true);

        /*try (var baos = zr.getArticleData("Giraffe", 'A')) {
        String str = baos.toString();
        Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.wiki.html"), str);
        Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Giraffe.out.html"), new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Giraffe", str));
        }*/
    }
}

View File

@ -21,7 +21,6 @@ java {
}
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:config')
implementation project(':code:common:model')
implementation project(':code:common:service')

View File

@ -21,7 +21,6 @@ java {
}
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')

View File

@ -22,7 +22,6 @@ tasks.distZip.enabled = false
apply from: "$rootProject.projectDir/docker-service.gradle"
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')

View File

@ -21,7 +21,6 @@ java {
}
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:service-discovery')

View File

@ -21,7 +21,6 @@ java {
}
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:service-discovery')

View File

@ -4,10 +4,12 @@ ext {
serviceToolOpts='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5000'
}
docker {
var df = new File(buildDir, "Dockerfile")
tasks.register('dockerFile') {
buildDir.mkdir()
df.text = """#
var df = new File(buildDir, "Dockerfile")
doLast {
df.text = """#
# I'm auto-generated, please don't make changes to me or commit me to git
#
# The template exists in docker-service.gradle
@ -22,11 +24,23 @@ ENV JAVA_OPTS="${serviceJvmOpts} "
ENTRYPOINT WMSA_HOME=/wmsa /${application.applicationName}/bin/${application.applicationName} \${arg0} \${arg1}
"""
}
it.outputs.file(df)
}
dockerfile = new File(buildDir, "Dockerfile")
dockerPrepare {
dependsOn tasks.dockerFile
}
dockerfileZip {
dependsOn tasks.dockerFile
}
docker {
dockerfile = tasks.dockerFile.outputs.files.singleFile
name = 'marginalia.nu/'+application.applicationName+':latest'
files tasks.distTar.outputs
tags 'latest'
dependsOn tasks.distTar
}

View File

@ -59,7 +59,6 @@ jmhJar {
zip64 true
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:common:service-discovery')

View File

@ -30,7 +30,6 @@ java {
}
}
dependencies {
implementation project(':third-party')
implementation project(':code:common:service')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')

View File

@ -52,7 +52,14 @@ include 'code:crawl:loading-process'
include 'code:crawl:common'
include 'code:crawl:experimental'
include 'third-party'
include 'third-party:porterstemmer'
include 'third-party:xz'
include 'third-party:symspell'
include 'third-party:rdrpostagger'
include 'third-party:uppend'
include 'third-party:openzim'
include 'third-party:monkey-patch-opennlp'
include 'other:memex'
include 'other:wmsa_old'

15
third-party/README.md vendored
View File

@ -6,14 +6,11 @@ or lack an artifact, or to override some default that is inappropriate for the t
## Sources and Licenses
### Modified
* [RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3
* [PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3
* [Uppend](https://github.com/upserve/uppend) - MIT
* [OpenZIM](https://github.com/openzim/libzim) - GPL-2.0
* [XZ for Java](https://tukaani.org/xz/) - Public Domain
* [SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0
* [RDRPosTagger](rdrpostagger/) - GPL3
* [PorterStemmer](porterstemmer/) - LGPL3
* [Uppend](uppend/) - MIT
* [OpenZIM](openzim/) - GPL-2.0
* [SymSpell](symspell/) - LGPL-3.0
### Monkey Patched
* [GSON](https://github.com/google/gson) - Apache-2.0
* OpenJDK - GPL-2.0 (packaged under jdkoverride)
* Stanford OpenNLP - Apache-2.0
* [Apache OpenNLP](monkey-patch-opennlp/) - Apache-2.0

View File

@ -27,5 +27,5 @@ dependencies {
}
test {
// Run this module's tests on the JUnit Platform. (A stray trailing backslash,
// which Groovy treats as a line continuation, was removed from this call.)
useJUnitPlatform()
}

View File

@ -0,0 +1,11 @@
# Monkey Patched OpenNLP
Apache OpenNLP - Apache-2.0
## Rationale
OpenNLP's sentence detector uses a slow StringBuffer instead of a StringBuilder where it makes
no sense to do so. This makes it much slower than it needs to be. I've found no way to file issues with the
project to get it fixed. Instead we're doing this monkey patch where the class is overridden with something
better.

24
third-party/openzim/build.gradle vendored Normal file
View File

@ -0,0 +1,24 @@
// Build script for the vendored OpenZIM module (third-party/openzim).
plugins {
id 'java'
}
// Build with a Java 17 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
// NOTE(review): the libs.* entries presumably resolve through the root project's
// version catalog, which is not visible here; confirm each one is actually needed.
dependencies {
implementation libs.bundles.nlp
implementation libs.zstd
implementation libs.commons.compress
implementation libs.ffi
implementation libs.databind
implementation libs.bundles.gson
// Sibling vendored module providing the XZ implementation.
implementation project(':third-party:xz')
}
// Execute tests on the JUnit Platform.
test {
useJUnitPlatform()
}

11
third-party/openzim/readme.md vendored Normal file
View File

@ -0,0 +1,11 @@
# OpenZIM
[OpenZIM](https://github.com/openzim/libzim) - GPL-2.0
OpenZIM is a ZIM file reader. This code has been modified in a fairly crude manner
to be much faster than the original code base which seems quite antique. It also
supports XZ compression.
**Important Note** the license is incompatible with AGPL 3, so we can't link Marginalia
directly to this. It's still very useful for building tools that deal with
wikipedia data which would be stand-alone.

16
third-party/porterstemmer/build.gradle vendored Normal file
View File

@ -0,0 +1,16 @@
// Build script for the vendored PorterStemmer module (third-party/porterstemmer).
plugins {
id 'java'
}
// Build with a Java 17 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
// Intentionally empty: this module declares no dependencies.
dependencies {
}
// Execute tests on the JUnit Platform.
test {
useJUnitPlatform()
}

6
third-party/porterstemmer/readme.md vendored Normal file
View File

@ -0,0 +1,6 @@
# Porterstemmer
[PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3
It's a [porter stemmer](https://tartarus.org/martin/PorterStemmer/) library, although one comes with OpenNLP
too. TBD which one to use, they're fairly equivalent.

16
third-party/rdrpostagger/build.gradle vendored Normal file
View File

@ -0,0 +1,16 @@
// Build script for the vendored RDRPosTagger module (third-party/rdrpostagger).
plugins {
id 'java'
}
// Build with a Java 17 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
// Intentionally empty: this module declares no dependencies.
dependencies {
}
// Execute tests on the JUnit Platform.
test {
useJUnitPlatform()
}

12
third-party/rdrpostagger/readme.md vendored Normal file
View File

@ -0,0 +1,12 @@
# RDRPosTagger
[RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3
datquocnguyen's excellent fast POS tagger. It's been crudely modified to be faster.
Unlike the original, it only does English.
## Citations
- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [RDRPOSTagger: A Ripple Down Rules-based Part-Of-Speech Tagger](http://www.aclweb.org/anthology/E14-2005). In *Proceedings of the Demonstrations at the 14th Conference of the European Chapter of the Association for Computational Linguistics*, EACL 2014, pp. 17-20, 2014. [[.PDF]](http://www.aclweb.org/anthology/E14-2005) [[.bib]](http://www.aclweb.org/anthology/E14-2005.bib)
- Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham. [A Robust Transformation-Based Learning Approach Using Ripple Down Rules for Part-Of-Speech Tagging](http://content.iospress.com/articles/ai-communications/aic698). *AI Communications* (AICom), vol. 29, no. 3, pp. 409-422, 2016. [[.PDF]](http://arxiv.org/pdf/1412.4021.pdf) [[.bib]](http://rdrpostagger.sourceforge.net/AICom.bib)

File diff suppressed because it is too large Load Diff

View File

@ -1,559 +0,0 @@
package jdkoverride;/*
* Copyright (c) 1996, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
/**
 * A copy of {@link java.io.BufferedReader} adapted for input with very long lines.
 *
 * <p>It differs from the JDK class in how {@code readLine} assembles a line: a single
 * {@link StringBuilder}, pre-sized to the expected line length, is kept per reader and
 * reused for every line instead of being allocated anew. All other operations follow
 * the JDK implementation, and every public operation synchronizes on the reader's
 * {@code lock}.
 */
public class LargeLineBufferedReader extends Reader {

    private static final int INVALIDATED = -2;
    private static final int UNMARKED = -1;

    /** Size of the character buffer used when the caller does not specify one. */
    private static final int DEFAULT_CHAR_BUFFER_SIZE = 8192;

    /** Initial capacity of the reusable line buffer; this is the value the class exists to enlarge. */
    private static final int DEFAULT_EXPECTED_LINE_LENGTH = 65536;

    private Reader in;

    private char[] cb;
    private int nChars, nextChar;

    private int markedChar = UNMARKED;
    private int readAheadLimit = 0; /* Valid only when markedChar > 0 */

    /** If the next character is a line feed, skip it */
    private boolean skipLF = false;

    /** The skipLF flag when the mark was set */
    private boolean markedSkipLF = false;

    /**
     * Line-assembly buffer shared by all {@code readLine} calls on this reader. It is
     * cleared at the start of each call and only touched while holding {@code lock}.
     */
    StringBuilder s = new StringBuilder(DEFAULT_EXPECTED_LINE_LENGTH);

    /**
     * Creates a buffering character-input stream that uses an input buffer of
     * the specified size.
     *
     * @param  in   A Reader
     * @param  sz   Input-buffer size
     *
     * @throws IllegalArgumentException  If {@code sz <= 0}
     */
    public LargeLineBufferedReader(Reader in, int sz) {
        super(in);
        if (sz <= 0) {
            throw new IllegalArgumentException("Buffer size <= 0");
        }
        this.in = in;
        cb = new char[sz];
        nextChar = nChars = 0;
    }

    /**
     * Creates a buffering character-input stream that uses a default-sized
     * input buffer.
     *
     * @param  in   A Reader
     */
    public LargeLineBufferedReader(Reader in) {
        this(in, DEFAULT_CHAR_BUFFER_SIZE);
    }

    /** Checks to make sure that the stream has not been closed */
    private void ensureOpen() throws IOException {
        if (in == null) {
            throw new IOException("Stream closed");
        }
    }

    /**
     * Fills the input buffer, taking the mark into account if it is valid.
     */
    private void fill() throws IOException {
        int dst;
        if (markedChar <= UNMARKED) {
            /* No mark */
            dst = 0;
        } else {
            /* Marked */
            int delta = nextChar - markedChar;
            if (delta >= readAheadLimit) {
                /* Gone past read-ahead limit: Invalidate mark */
                markedChar = INVALIDATED;
                readAheadLimit = 0;
                dst = 0;
            } else {
                if (readAheadLimit <= cb.length) {
                    /* Shuffle in the current buffer */
                    System.arraycopy(cb, markedChar, cb, 0, delta);
                    markedChar = 0;
                    dst = delta;
                } else {
                    /* Reallocate buffer to accommodate read-ahead limit */
                    char[] ncb = new char[readAheadLimit];
                    System.arraycopy(cb, markedChar, ncb, 0, delta);
                    cb = ncb;
                    markedChar = 0;
                    dst = delta;
                }
                nextChar = nChars = delta;
            }
        }

        int n;
        do {
            n = in.read(cb, dst, cb.length - dst);
        } while (n == 0);
        if (n > 0) {
            nChars = dst + n;
            nextChar = dst;
        }
    }

    /**
     * Reads a single character.
     *
     * @return The character read, as an integer in the range
     *         0 to 65535 ({@code 0x00-0xffff}), or -1 if the
     *         end of the stream has been reached
     * @throws     IOException  If an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        synchronized (lock) {
            ensureOpen();
            for (;;) {
                if (nextChar >= nChars) {
                    fill();
                    if (nextChar >= nChars) {
                        return -1;
                    }
                }
                if (skipLF) {
                    skipLF = false;
                    if (cb[nextChar] == '\n') {
                        nextChar++;
                        continue;
                    }
                }
                return cb[nextChar++];
            }
        }
    }

    /**
     * Reads characters into a portion of an array, reading from the underlying
     * stream if necessary.
     */
    private int read1(char[] cbuf, int off, int len) throws IOException {
        if (nextChar >= nChars) {
            /* If the requested length is at least as large as the buffer, and
               if there is no mark/reset activity, and if line feeds are not
               being skipped, do not bother to copy the characters into the
               local buffer.  In this way buffered streams will cascade
               harmlessly. */
            if (len >= cb.length && markedChar <= UNMARKED && !skipLF) {
                return in.read(cbuf, off, len);
            }
            fill();
        }
        if (nextChar >= nChars) {
            return -1;
        }
        if (skipLF) {
            skipLF = false;
            if (cb[nextChar] == '\n') {
                nextChar++;
                if (nextChar >= nChars) {
                    fill();
                }
                if (nextChar >= nChars) {
                    return -1;
                }
            }
        }
        int n = Math.min(len, nChars - nextChar);
        System.arraycopy(cb, nextChar, cbuf, off, n);
        nextChar += n;
        return n;
    }

    /**
     * Reads characters into a portion of an array.
     *
     * <p> This method implements the general contract of the corresponding
     * {@link Reader#read(char[], int, int) read} method of the
     * {@link Reader} class.  As an additional convenience, it
     * attempts to read as many characters as possible by repeatedly invoking
     * the {@code read} method of the underlying stream.  This iterated
     * {@code read} continues until one of the following conditions becomes
     * true:
     * <ul>
     *
     *   <li> The specified number of characters have been read,
     *
     *   <li> The {@code read} method of the underlying stream returns
     *   {@code -1}, indicating end-of-file, or
     *
     *   <li> The {@code ready} method of the underlying stream
     *   returns {@code false}, indicating that further input requests
     *   would block.
     *
     * </ul>
     * If the first {@code read} on the underlying stream returns
     * {@code -1} to indicate end-of-file then this method returns
     * {@code -1}.  Otherwise this method returns the number of characters
     * actually read.
     *
     * <p> Subclasses of this class are encouraged, but not required, to
     * attempt to read as many characters as possible in the same fashion.
     *
     * <p> Ordinarily this method takes characters from this stream's character
     * buffer, filling it from the underlying stream as necessary.  If,
     * however, the buffer is empty, the mark is not valid, and the requested
     * length is at least as large as the buffer, then this method will read
     * characters directly from the underlying stream into the given array.
     * Thus redundant {@code BufferedReader}s will not copy data
     * unnecessarily.
     *
     * @param      cbuf  {@inheritDoc}
     * @param      off   {@inheritDoc}
     * @param      len   {@inheritDoc}
     *
     * @return     {@inheritDoc}
     *
     * @throws     IndexOutOfBoundsException {@inheritDoc}
     * @throws     IOException  {@inheritDoc}
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        synchronized (lock) {
            ensureOpen();
            Objects.checkFromIndexSize(off, len, cbuf.length);
            if (len == 0) {
                return 0;
            }

            int n = read1(cbuf, off, len);
            if (n <= 0) {
                return n;
            }
            while ((n < len) && in.ready()) {
                int n1 = read1(cbuf, off + n, len - n);
                if (n1 <= 0) {
                    break;
                }
                n += n1;
            }
            return n;
        }
    }

    /**
     * Reads a line of text.  A line is considered to be terminated by any one
     * of a line feed ('\n'), a carriage return ('\r'), a carriage return
     * followed immediately by a line feed, or by reaching the end-of-file
     * (EOF).
     *
     * @param      ignoreLF  If true, the next '\n' will be skipped
     * @param      term      Output: Whether a line terminator was encountered
     *                       while reading the line; may be {@code null}.
     *
     * @return     A String containing the contents of the line, not including
     *             any line-termination characters, or null if the end of the
     *             stream has been reached without reading any characters
     *
     * @see        java.io.LineNumberReader#readLine()
     *
     * @throws     IOException  If an I/O error occurs
     */
    String readLine(boolean ignoreLF, boolean[] term) throws IOException {
        int startChar;

        synchronized (lock) {
            /* Start from an empty line buffer; its capacity is retained across calls */
            s.setLength(0);

            ensureOpen();
            boolean omitLF = ignoreLF || skipLF;
            if (term != null) {
                term[0] = false;
            }

            for (;;) {
                if (nextChar >= nChars) {
                    fill();
                }
                if (nextChar >= nChars) { /* EOF */
                    /* Characters gathered without a terminator still form a final line */
                    return (s.length() > 0) ? s.toString() : null;
                }

                boolean eol = false;
                char c = 0;
                int i;

                /* Skip a leftover '\n', if necessary */
                if (omitLF && (cb[nextChar] == '\n')) {
                    nextChar++;
                }
                skipLF = false;
                omitLF = false;

                for (i = nextChar; i < nChars; i++) {
                    c = cb[i];
                    if ((c == '\n') || (c == '\r')) {
                        if (term != null) {
                            term[0] = true;
                        }
                        eol = true;
                        break;
                    }
                }

                startChar = nextChar;
                nextChar = i;

                /* Everything scanned in this pass belongs to the current line */
                s.append(cb, startChar, i - startChar);

                if (eol) {
                    String str = s.toString();
                    nextChar++;
                    if (c == '\r') {
                        /* A '\n' directly after this '\r' is part of the same terminator */
                        skipLF = true;
                    }
                    return str;
                }
            }
        }
    }

    /**
     * Reads a line of text.  A line is considered to be terminated by any one
     * of a line feed ('\n'), a carriage return ('\r'), a carriage return
     * followed immediately by a line feed, or by reaching the end-of-file
     * (EOF).
     *
     * @return     A String containing the contents of the line, not including
     *             any line-termination characters, or null if the end of the
     *             stream has been reached without reading any characters
     *
     * @throws     IOException  If an I/O error occurs
     *
     * @see java.nio.file.Files#readAllLines
     */
    public String readLine() throws IOException {
        return readLine(false, null);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public long skip(long n) throws IOException {
        if (n < 0L) {
            throw new IllegalArgumentException("skip value is negative");
        }
        synchronized (lock) {
            ensureOpen();
            long r = n;
            while (r > 0) {
                if (nextChar >= nChars) {
                    fill();
                }
                if (nextChar >= nChars) { /* EOF */
                    break;
                }
                if (skipLF) {
                    skipLF = false;
                    if (cb[nextChar] == '\n') {
                        nextChar++;
                    }
                }
                long d = nChars - nextChar;
                if (r <= d) {
                    nextChar += r;
                    r = 0;
                    break;
                } else {
                    r -= d;
                    nextChar = nChars;
                }
            }
            return n - r;
        }
    }

    /**
     * Tells whether this stream is ready to be read.  A buffered character
     * stream is ready if the buffer is not empty, or if the underlying
     * character stream is ready.
     *
     * @throws     IOException  If an I/O error occurs
     */
    @Override
    public boolean ready() throws IOException {
        synchronized (lock) {
            ensureOpen();

            /*
             * If newline needs to be skipped and the next char to be read
             * is a newline character, then just skip it right away.
             */
            if (skipLF) {
                /* Note that in.ready() will return true if and only if the next
                 * read on the stream will not block.
                 */
                if (nextChar >= nChars && in.ready()) {
                    fill();
                }
                if (nextChar < nChars) {
                    if (cb[nextChar] == '\n') {
                        nextChar++;
                    }
                    skipLF = false;
                }
            }
            return (nextChar < nChars) || in.ready();
        }
    }

    /**
     * Tells whether this stream supports the mark() operation, which it does.
     */
    @Override
    public boolean markSupported() {
        return true;
    }

    /**
     * Marks the present position in the stream.  Subsequent calls to reset()
     * will attempt to reposition the stream to this point.
     *
     * @param readAheadLimit   Limit on the number of characters that may be
     *                         read while still preserving the mark. An attempt
     *                         to reset the stream after reading characters
     *                         up to this limit or beyond may fail.
     *                         A limit value larger than the size of the input
     *                         buffer will cause a new buffer to be allocated
     *                         whose size is no smaller than limit.
     *                         Therefore large values should be used with care.
     *
     * @throws     IllegalArgumentException  If {@code readAheadLimit < 0}
     * @throws     IOException  If an I/O error occurs
     */
    @Override
    public void mark(int readAheadLimit) throws IOException {
        if (readAheadLimit < 0) {
            throw new IllegalArgumentException("Read-ahead limit < 0");
        }
        synchronized (lock) {
            ensureOpen();
            this.readAheadLimit = readAheadLimit;
            markedChar = nextChar;
            markedSkipLF = skipLF;
        }
    }

    /**
     * Resets the stream to the most recent mark.
     *
     * @throws     IOException  If the stream has never been marked,
     *                          or if the mark has been invalidated
     */
    @Override
    public void reset() throws IOException {
        synchronized (lock) {
            ensureOpen();
            if (markedChar < 0) {
                throw new IOException((markedChar == INVALIDATED)
                        ? "Mark invalid"
                        : "Stream not marked");
            }
            nextChar = markedChar;
            skipLF = markedSkipLF;
        }
    }

    /**
     * Closes the underlying reader and releases the character buffer. Calling this
     * on an already closed reader does nothing.
     *
     * @throws     IOException  If closing the underlying reader fails
     */
    @Override
    public void close() throws IOException {
        synchronized (lock) {
            if (in == null) {
                return;
            }
            try {
                in.close();
            } finally {
                /* Mark as closed even if the underlying close failed */
                in = null;
                cb = null;
            }
        }
    }

    /**
     * Returns a {@code Stream}, the elements of which are lines read from
     * this {@code BufferedReader}.  The {@link Stream} is lazily populated,
     * i.e., read only occurs during the
     * <a href="../util/stream/package-summary.html#StreamOps">terminal
     * stream operation</a>.
     *
     * <p> The reader must not be operated on during the execution of the
     * terminal stream operation. Otherwise, the result of the terminal stream
     * operation is undefined.
     *
     * <p> After execution of the terminal stream operation there are no
     * guarantees that the reader will be at a specific position from which to
     * read the next character or line.
     *
     * <p> If an {@link IOException} is thrown when accessing the underlying
     * {@code BufferedReader}, it is wrapped in an {@link
     * UncheckedIOException} which will be thrown from the {@code Stream}
     * method that caused the read to take place. This method will return a
     * Stream if invoked on a BufferedReader that is closed. Any operation on
     * that stream that requires reading from the BufferedReader after it is
     * closed, will cause an UncheckedIOException to be thrown.
     *
     * @return a {@code Stream<String>} providing the lines of text
     *         described by this {@code BufferedReader}
     *
     * @since 1.8
     */
    public Stream<String> lines() {
        Iterator<String> iter = new Iterator<>() {
            String nextLine = null;

            @Override
            public boolean hasNext() {
                if (nextLine != null) {
                    return true;
                } else {
                    try {
                        nextLine = readLine();
                        return (nextLine != null);
                    } catch (IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }
            }

            @Override
            public String next() {
                if (nextLine != null || hasNext()) {
                    String line = nextLine;
                    nextLine = null;
                    return line;
                } else {
                    throw new NoSuchElementException();
                }
            }
        };
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
                iter, Spliterator.ORDERED | Spliterator.NONNULL), false);
    }
}

16
third-party/symspell/build.gradle vendored Normal file
View File

@ -0,0 +1,16 @@
// Build script for the vendored SymSpell module (third-party/symspell).
plugins {
id 'java'
}
// Build with a Java 17 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
// Intentionally empty: this module declares no dependencies.
dependencies {
}
// Execute tests on the JUnit Platform.
test {
useJUnitPlatform()
}

9
third-party/symspell/readme.md vendored Normal file
View File

@ -0,0 +1,9 @@
# SymSpell
[SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0
Fast spell checking library. Ostensibly lacks an artifact, so we're packaging it ourselves.
## Further Reading
Wolf Garbe, [1000x Faster Spelling Correction algorithm (2012)](https://wolfgarbe.medium.com/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f)

17
third-party/uppend/build.gradle vendored Normal file
View File

@ -0,0 +1,17 @@
// Build script for the vendored Uppend module (third-party/uppend).
plugins {
id 'java'
}
// Build with a Java 17 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
// NOTE(review): presumably the native-call binding used for the madvise wrapper
// described in this module's readme; confirm against the version catalog.
implementation libs.ffi
}
// Execute tests on the JUnit Platform.
test {
useJUnitPlatform()
}

6
third-party/uppend/readme.md vendored Normal file
View File

@ -0,0 +1,6 @@
# Uppend
[Uppend](https://github.com/upserve/uppend) - MIT
It's "an append-only, key-multivalue store". Cool project, but we're unceremoniously pillaging just a small piece of
code they did for calling [madvise()](https://man7.org/linux/man-pages/man2/madvise.2.html) on off-heap byte buffers.

16
third-party/xz/build.gradle vendored Normal file
View File

@ -0,0 +1,16 @@
// Build script for the vendored XZ module (third-party/xz).
plugins {
id 'java'
}
// Build with a Java 17 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
// Intentionally empty: this module declares no dependencies.
dependencies {
}
// Execute tests on the JUnit Platform.
test {
useJUnitPlatform()
}

9
third-party/xz/readme.md vendored Normal file
View File

@ -0,0 +1,9 @@
# XZ
[XZ for Java](https://tukaani.org/xz/) - Public Domain
"XZ Utils is free general-purpose data compression software with a high compression ratio.
XZ Utils were written for POSIX-like systems, but also work on some not-so-POSIX systems.
XZ Utils are the successor to LZMA Utils."
Needed for [openzim](../openzim) to deal with modern zim files.

Some files were not shown because too many files have changed in this diff Show More