Remove unrelated code, break tools into their own directory.

This commit is contained in:
Viktor Lofgren 2023-03-17 16:03:11 +01:00
parent 449471a076
commit 2eb972dea1
224 changed files with 377 additions and 9972 deletions

View File

@ -3,9 +3,7 @@ package nu.marginalia.model.idx;
import java.util.EnumSet;
public enum DocumentFlags {
/** Simple processing was done, this document should be de-prioritized as a search result */
Simple,
UnusedBit1,
PlainText,
UnusedBit2,
UnusedBit3,

View File

@ -67,7 +67,7 @@ public class DocumentKeywordExtractor {
String flatWord = AsciiFlattener.flattenUnicode(word.word);
if (WordPatterns.hasWordQualities(flatWord)) {
if (!flatWord.isBlank()) {
wordsBuilder.add(flatWord, metadata.getMetadataForWord(word.stemmed));
}
}

View File

@ -220,7 +220,7 @@ public class KeywordExtractor {
}
String word = sentence.constructWordFromSpan(w);
if (word.isBlank() || !WordPatterns.filter(word)) return false;
if (word.isBlank() || !WordPatterns.isNotJunkWord(word)) return false;
if (sentence.posTags[w.start].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("IN")) return false;
if (sentence.posTags[w.end-1].equals("DT")) return false;

View File

@ -12,11 +12,6 @@ public class IndexJournalEntryBuilder {
this.documentMeta = documentMeta;
}
public IndexJournalEntryBuilder capacity(int size) {
items.ensureCapacity(size);
return this;
}
public IndexJournalEntryBuilder add(long wordId, long metadata) {
items.add(wordId);

View File

@ -13,12 +13,13 @@ java {
}
dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:ngram-bloom-filter')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:common:config')
implementation project(':code:common:model')
implementation project(':third-party:porterstemmer')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
@ -26,6 +27,7 @@ dependencies {
implementation libs.bundles.handlebars
implementation libs.trove
implementation libs.guice
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@ -1,4 +1,4 @@
package nu.marginalia.language.statistics;
package nu.marginalia.language;
import com.google.inject.Inject;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;

View File

@ -1,4 +1,4 @@
package nu.marginalia.ngram_bloom_filter;
package nu.marginalia.ngrams;
import java.io.IOException;
import java.nio.ByteBuffer;

View File

@ -1,4 +1,4 @@
package nu.marginalia.ngram_bloom_filter;
package nu.marginalia.ngrams;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.common.hash.HashFunction;

View File

@ -6,9 +6,9 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.LanguageModels;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.EnglishDictionary;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;

View File

@ -1,4 +1,4 @@
package nu.marginalia.ngram_bloom_filter;
package nu.marginalia.ngrams;
import org.junit.jupiter.api.Test;

View File

@ -1,8 +1,8 @@
package nu.marginalia.query_parser;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.language.EnglishDictionary;
import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.query_parser.token.TokenType;
import nu.marginalia.util.TestLanguageModels;

View File

@ -1,8 +1,8 @@
package nu.marginalia.query_parser;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.language.EnglishDictionary;
import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.language.sentence.SentenceExtractor;

View File

@ -41,7 +41,7 @@ class TermCoherenceFactorTest {
assertEquals(0, termCoherenceFactor.calculate(allPositionsSet));
}
@Test
@Test @SuppressWarnings("unchecked")
public void testLowPosMatches() {
var allPositionsSet = createSet(
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
@ -53,7 +53,7 @@ class TermCoherenceFactorTest {
assertEquals(1.0, termCoherenceFactor.bitPositionFactor(mask), 0.01);
}
@Test
@Test @SuppressWarnings("unchecked")
public void testHiPosMatches() {
var allPositionsSet = createSet(
List.of(28, 29, 30, 31), List.of(28, 29, 30, 31)

View File

@ -8,8 +8,6 @@ import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/** Regular expression patterns for deciding which words are eligible to be keywords.
* <p/>
@ -44,25 +42,17 @@ public class WordPatterns {
}
}
private static boolean hasMoreThanTwo(String s, char c, int max) {
int idx = 0;
for (int i = 0; i <= max; i++) {
idx = s.indexOf(c, idx+1);
if (idx < 0 || idx >= s.length() - 1)
return false;
}
return true;
}
public static boolean filter(String word) {
/** Run checks on the word and exclude terms with too many special characters
*/
public static boolean isNotJunkWord(String word) {
if (word.isBlank()) {
return false;
}
if (hasMoreThanTwo(word, '-', 4)) {
if (hasMoreThanN(word, '-', 4)) {
return false;
}
if (hasMoreThanTwo(word, '+', 2)) {
if (hasMoreThanN(word, '+', 2)) {
return false;
}
if (word.startsWith("-")
@ -83,29 +73,13 @@ public class WordPatterns {
return true;
}
public static boolean hasWordQualities(String s) {
if (s.isBlank())
return false;
int start = 0;
int end = s.length();
if (s.charAt(0) == '#') start++;
if (end > 1 && s.charAt(end-1) == '#') end--;
for (int i = start; i < end; i++) {
char c = s.charAt(i);
if (("_@.'+-".indexOf(c) < 0)
&& !(c >= 'a' && c <= 'z')
&& !(c >= 'A' && c <= 'Z')
&& !(c >= '0' && c <= '9')
&& !(c >= '\u00C0' && c <= '\u00D6')
&& !(c >= '\u00D8' && c <= '\u00f6')
&& !(c >= '\u00f8' && c <= '\u00ff'))
{
return false;
}
/** Checks whether {@code c} occurs more than {@code max} times in the interior of {@code s}.
 * <p>
 * The search starts at index 1, so a character at index 0 is never counted, and
 * a match on the last index ends the search with {@code false}. In effect the
 * method returns true when at least {@code max + 1} occurrences of {@code c}
 * lie strictly between the first and the last character of {@code s}.
 */
private static boolean hasMoreThanN(String s, char c, int max) {
int idx = 0;
for (int i = 0; i <= max; i++) {
// idx+1 skips index 0 on the first pass and steps past the previous hit afterwards
idx = s.indexOf(c, idx+1);
// no further occurrence, or the occurrence is the very last character
if (idx < 0 || idx >= s.length() - 1)
return false;
}
return true;
}
@ -113,10 +87,8 @@ public class WordPatterns {
if (s.length() < MIN_WORD_LENGTH) {
return true;
}
if (!hasWordQualities(s)) {
return true;
}
if (!filter(s)) {
if (!isNotJunkWord(s)) {
return true;
}

View File

@ -1,42 +0,0 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id "de.undercouch.download" version "5.1.0"
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation project(':code:common:config')
implementation project(':third-party:porterstemmer')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.bundles.nlp
implementation libs.guice
implementation libs.trove
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -21,6 +21,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:array')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -0,0 +1,12 @@
# Term Frequency Dictionary
This dictionary is used by various parts of the system to evaluate for example
the TF-IDF score of a keyword.
## Central Classes
* [TermFrequencyDict](src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java)
## See Also
* [tools/term-frequency-extractor](../../tools/term-frequency-extractor) generates the dictionary file

View File

@ -1,8 +1,10 @@
package nu.marginalia.term_frequency_dict;
import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.array.LongArray;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@ -14,39 +16,45 @@ import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
/** Dictionary with term frequency information for (stemmed) words.
*
*/
@Singleton
public class TermFrequencyDict {
private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Long2IntOpenHashMap wordRates;
private static final Logger logger = LoggerFactory.getLogger(TermFrequencyDict.class);
private static final PorterStemmer ps = new PorterStemmer();
private static final long DOC_COUNT_KEY = ~0L;
public static final long DOC_COUNT_KEY = ~0L;
@Inject
public TermFrequencyDict(@NotNull LanguageModels models) {
this(models.termFrequencies);
}
@SneakyThrows
public TermFrequencyDict(Path file) {
try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(file.toFile())))) {
wordRates.ensureCapacity((int)(Files.size(file)/16));
for (;;) {
wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
}
} catch (EOFException eof) {
// ok
} catch (IOException e) {
logger.error("IO Exception reading " + file, e);
}
wordRates = load(file);
logger.info("Read {} N-grams frequencies", wordRates.size());
}
public TermFrequencyDict(TLongIntHashMap data) {
wordRates.putAll(data);
/** Loads the term frequency dictionary from {@code file}.
 * <p>
 * The file is consumed as consecutive pairs of longs: a term hash followed by
 * its count, which is narrowed to an int. The number of pairs is the file size
 * divided by 16, so trailing bytes that do not form a full pair are ignored.
 *
 * @param file path to the dictionary file
 * @return map from term hash to count; hashes that are absent map to 0
 * @throws IOException if the file cannot be sized or read
 * @throws ArithmeticException if the file holds more pairs than fit in an int
 */
private static Long2IntOpenHashMap load(Path file) throws IOException {
    LongArray array = LongArray.mmapRead(file);

    // Divide before narrowing: "(int) Files.size(file)/16" cast the byte count
    // to int first, which truncates for files of 2 GiB or more.
    int size = Math.toIntExact(Files.size(file) / 16);

    var ret = new Long2IntOpenHashMap(size, 0.5f);
    ret.defaultReturnValue(0);

    for (int i = 0; i < size; i++) {
        // long arithmetic so the offset cannot wrap for very large dictionaries
        long offset = 2L * i;
        ret.put(array.get(offset), (int) array.get(offset + 1));
    }

    return ret;
}
/** Total number of documents in the corpus */
public int docCount() {
int cnt = wordRates.get(DOC_COUNT_KEY);
@ -56,91 +64,20 @@ public class TermFrequencyDict {
return cnt;
}
// WIP refactoring, this needs a new home:
//
// public static void main(String... args) throws IOException, InterruptedException {
// if (args.length != 2) {
// System.err.println("Expected arguments: plan.yaml out-file");
// }
// String outFile = args[1];
//
// var plan = new CrawlPlanLoader().load(Path.of(args[0]));
//
// ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
// LanguageFilter lf = new LanguageFilter();
//
// TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
//
// ForkJoinPool fjp = new ForkJoinPool(24);
// AtomicInteger docCount = new AtomicInteger();
//
// for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
//
// if (domain.doc == null)
// continue;
//
// fjp.execute(() -> {
//
// TLongHashSet words = new TLongHashSet(10_000);
//
// for (var doc : domain.doc) {
//
// if (doc.documentBody == null)
// continue;
// docCount.incrementAndGet();
//
// Document parsed = Jsoup.parse(doc.documentBody.decode());
// parsed.body().filter(new DomPruningFilter(0.5));
//
// DocumentLanguageData dld = se.get().extractSentences(parsed);
//
// if (lf.dictionaryAgreement(dld) < 0.1) {
// return;
// }
//
// for (var sent : dld.sentences) {
// for (var word : sent) {
// words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
// }
// }
//
// synchronized (counts) {
// words.forEach(w -> {
// counts.adjustOrPutValue(w, 1, 1);
// return true;
// });
// }
//
// words.clear();
// }
//
// System.out.println(domain.domain + "\t" + counts.size());
// });
//
//
// }
//
// fjp.shutdown();
// fjp.awaitTermination(10, TimeUnit.DAYS);
//
// try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
// synchronized (counts) {
// counts.put(DOC_COUNT_KEY, docCount.get());
//
// counts.forEachEntry((hash, cnt) -> {
// try {
// dos.writeLong(hash);
// dos.writeLong(cnt);
// } catch (IOException e) {
// throw new RuntimeException(e);
// }
// return true;
// });
// }
// }
//
// System.out.println(docCount.get());
// }
/** Get the term frequency for the string s */
public long getTermFreq(String s) {
return wordRates.get(getStringHash(s));
}
/** Get the term frequency for the already stemmed string s.
 * <p>
 * The string is hashed over its UTF-8 bytes. The charset is given explicitly so
 * the lookup does not depend on the platform default encoding.
 *
 * @param s a stemmed term
 * @return the stored count for the term's hash, or the map's default value if absent
 */
public long getTermFreqStemmed(String s) {
    return wordRates.get(longHash(s.getBytes(java.nio.charset.StandardCharsets.UTF_8)));
}
/** Get the term frequency for the already stemmed and already hashed value 'hash' */
public long getTermFreqHash(long hash) {
return wordRates.get(hash);
}
public static long getStringHash(String s) {
if (s.indexOf(' ') >= 0 || s.indexOf('_') >= 0) {
@ -156,17 +93,11 @@ public class TermFrequencyDict {
}
}
public long getTermFreqHash(long hash) {
return wordRates.get(hash);
}
public long getTermFreq(String s) {
return wordRates.get(getStringHash(s));
}
public long getTermFreqStemmed(String s) {
return wordRates.get(longHash(s.getBytes()));
}
// If this ever changes, we need to re-generate the term frequency dictionary
/** The hashing function used by TermFrequencyHash
* <p>
* If this function changes its behavior in any way,
* it is necessary to re-generate the dictionary.
*/
public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;

View File

@ -1,4 +1,49 @@
# Converting Models
Contains models shared by the [converting-process](../../processes/converting-process/) and
[loading-process](../../processes/loading-process/).
[loading-process](../../processes/loading-process/).
## Design
The two processes communicate through a file-based protocol. The converter serializes [instructions](src/main/java/nu/marginalia/converting/instruction/Instruction.java)
to file, which are deserialized by the loader and fed into an [interpreter](src/main/java/nu/marginalia/converting/instruction/Interpreter.java).
The instructions implement a visitor pattern.
Conceptually, the pattern can be thought of as a bit like remote function calls over files,
or a crude instructions-based programming language.
This
```java
producer.foo("cat");
producer.bar("milk", "eggs", "bread");
```
translates through this paradigm, to this:
```
(producer)
writeInstruction(DoFoo("cat"))
writeInstruction(DoBar("milk", "eggs", "bread"))
(consumer)
while read instruction:
interpreter.apply(instruction)
(Interpreter)
doFoo(animal):
...
doBar(ingredients):
...
(doFoo)
DoFoo(animal):
apply(interpreter):
interpreter.foo(animal)
(doBar)
DoBar(ingredients):
apply(interpreter):
interpreter.bar(ingredients)
```

View File

@ -7,6 +7,7 @@ Contains models shared by the [crawling-process](../../processes/crawling-proces
* [CrawledDocument](src/main/java/nu/marginalia/crawling/model/CrawledDocument.java)
* [CrawledDomain](src/main/java/nu/marginalia/crawling/model/CrawledDomain.java)
* [CrawlingSpecification](src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java)
### Marshalling
* [CrawledDomainReader](src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java)

View File

@ -1,4 +1,4 @@
package nu.marginalia.language;
package nu.marginalia.converting.language;
import nu.marginalia.language.encoding.UnicodeRanges;
import nu.marginalia.language.model.DocumentLanguageData;

View File

@ -29,7 +29,7 @@ public class ProcessedDocument {
if (details == null)
return false;
return !details.metadata.hasFlag(DocumentFlags.Simple);
return true;
}
public OptionalDouble quality() {

View File

@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.LanguageFilter;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;

View File

@ -4,6 +4,7 @@ package nu.marginalia.converting;
import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
@ -22,8 +23,7 @@ import static org.junit.jupiter.api.Assertions.*;
public class ConvertingIntegrationTest {
DomainProcessor domainProcessor;
private DomainProcessor domainProcessor;
@BeforeEach
public void setUp() {
@ -60,7 +60,22 @@ public class ConvertingIntegrationTest {
ret.documents.forEach(doc -> {
resultsByStatusCount.merge(doc.state, 1, Integer::sum);
});
assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 5);
assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);
for (var doc : ret.documents) {
if (!doc.isProcessedFully()) {
continue;
}
var details = doc.details;
assertTrue(details.title.length() > 4);
assertTrue(details.description.length() > 4);
assertEquals(HtmlStandard.HTML5, details.standard);
}
}
private CrawledDomain readMarginaliaWorkingSet() throws IOException {

View File

@ -1,6 +1,5 @@
package nu.marginalia.crawling;
package nu.marginalia.converting.language;
import nu.marginalia.language.LanguageFilter;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

View File

@ -0,0 +1,3 @@
# Test Data
This is a snapshot of memex.marginalia.nu from 2023-03-17.

View File

@ -1,4 +0,0 @@
# Crawl Job Extractor
The crawl job extractor creates a file containing a list of domains
along with known URLs. This is consumed by the [crawling-process](../crawling-process).

View File

@ -1,22 +1,20 @@
# Processes
## 1. Crawl Job Extractor
The [crawl-job-extractor-process](crawl-job-extractor-process/) creates a crawl job specification
based on the content in the database.
## 2. Crawl Process
## 1. Crawl Process
The [crawling-process](crawling-process/) fetches website contents and saves them
as compressed JSON models described in [crawling-model](../process-models/crawling-model/).
## 3. Converting Process
The operation is specified by a crawl job specification. This is generated by [tools/crawl-job-extractor](../tools/crawl-job-extractor/)
based on the content in the database.
## 2. Converting Process
The [converting-process](converting-process/) reads crawl data from the crawling step and
processes them, extracting keywords and metadata and saves them as compressed JSON models
described in [converting-model](../process-models/converting-model/).
## 4. Loading Process
## 3. Loading Process
The [loading-process](loading-process/) reads the processed data and creates an index journal
and lexicon, and loads domains and addresses into the MariaDB-database.

View File

@ -21,11 +21,15 @@ You'll find a short description in each module of what it does and how it relate
Processes are batch jobs that deal with data retrieval, processing and loading.
* [processes](processes/)
* * [crawl-job-extractor](processes/crawl-job-extractor-process)
* * [crawling-process](processes/crawling-process)
* * [converting-process](processes/converting-process)
* * [loading-process](processes/loading-process)
#### Tools
* * [crawl-job-extractor](tools/crawl-job-extractor)
* * [term-frequency-extractor](tools/term-frequency-extractor)
### Features
Features are relatively stand-alone components that serve some part of the domain. They aren't domain-independent,

View File

@ -29,7 +29,6 @@ dependencies {
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:ngram-bloom-filter')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:api:assistant-api')

View File

@ -8,8 +8,8 @@ import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.language.EnglishDictionary;
import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.query_parser.QueryParser;
import nu.marginalia.query_parser.QueryPermutation;

View File

@ -2,9 +2,9 @@ package nu.marginalia.search.query;
import nu.marginalia.WmsaHome;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.EnglishDictionary;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.search.command.SearchJsParameter;
import nu.marginalia.search.model.SearchProfile;

View File

@ -31,7 +31,7 @@ dependencies {
implementation libs.bundles.mariadb
implementation libs.guice
implementation libs.gson
implementation libs.bundles.gson
implementation libs.zstd
testImplementation libs.bundles.slf4j.test

View File

@ -0,0 +1,6 @@
# Crawl Job Extractor
The crawl job extractor creates a file containing a list of domains
along with known URLs.
This is consumed by [processes/crawling-process](../../processes/crawling-process).

View File

@ -0,0 +1,62 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id 'application'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
application {
mainClass = 'nu.marginalia.tools.TermFrequencyExtractor'
applicationName = 'term-frequency-extractor'
}
tasks.distZip.enabled = false
dependencies {
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:monkey-patch-opennlp')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:process')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:big-string')
implementation project(':code:processes:converting-process')
implementation project(':code:process-models:crawling-model')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.guice
implementation libs.jsoup
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.nlp
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -0,0 +1,16 @@
# Term Frequency Extractor
Generates a term frequency dictionary file from a batch of crawl data.
Usage:
```shell
PATH_TO_SAMPLES=run/samples/crawl-s
export JAVA_OPTS=-Dcrawl.rootDirRewrite=/crawl:${PATH_TO_SAMPLES}
term-frequency-extractor ${PATH_TO_SAMPLES}/plan.yaml out.dat
```
## See Also
* [libraries/term-frequency-dict](../../libraries/term-frequency-dict)

View File

@ -0,0 +1,114 @@
package nu.marginalia.tools;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import plan.CrawlPlanLoader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import static nu.marginalia.term_frequency_dict.TermFrequencyDict.DOC_COUNT_KEY;
import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
/** Command line tool that builds a term frequency dictionary file from crawl data.
 * <p>
 * Usage: {@code term-frequency-extractor plan.yaml out-file}
 * <p>
 * For every processed document, the distinct stemmed words are hashed with
 * {@code TermFrequencyDict.longHash} and the count for each hash is incremented
 * once. The output file is a sequence of (hash, count) pairs written with
 * {@link DataOutputStream#writeLong(long)}, plus one entry stored under
 * {@code DOC_COUNT_KEY} holding the number of documents with a body that were visited.
 */
public class TermFrequencyExtractor {

    /** Entry point.
     *
     * @param args {@code plan.yaml} and the output file, in that order
     * @throws IOException if the plan cannot be loaded or the output cannot be written
     * @throws InterruptedException if interrupted while waiting for the workers
     */
    public static void main(String... args) throws IOException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Expected arguments: plan.yaml out-file");
            return;
        }

        String outFile = args[1];

        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        // One SentenceExtractor per worker thread instead of a shared instance
        ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
        LanguageFilter lf = new LanguageFilter();

        // term hash -> number of documents containing the term.
        // Every access from a worker must hold the monitor of 'counts'.
        TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);

        ForkJoinPool fjp = new ForkJoinPool(24);
        AtomicInteger docCount = new AtomicInteger();

        for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine

            if (domain.doc == null)
                continue;

            fjp.execute(() -> {

                // Distinct term hashes of the current document; reused between documents
                TLongHashSet words = new TLongHashSet(10_000);

                for (var doc : domain.doc) {

                    if (doc.documentBody == null)
                        continue;
                    docCount.incrementAndGet();

                    Document parsed = Jsoup.parse(doc.documentBody.decode());
                    parsed.body().filter(new DomPruningFilter(0.5));

                    DocumentLanguageData dld = se.get().extractSentences(parsed);

                    // NOTE(review): this leaves the task, so the remaining documents of the
                    // domain are skipped as well (and the progress line is not printed).
                    // Confirm that skipping the whole domain is intended rather than 'continue'.
                    if (lf.dictionaryAgreement(dld) < 0.1) {
                        return;
                    }

                    for (var sent : dld.sentences) {
                        for (var word : sent) {
                            words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
                        }
                    }

                    synchronized (counts) {
                        words.forEach(w -> {
                            counts.adjustOrPutValue(w, 1, 1);
                            return true;
                        });
                    }

                    words.clear();
                }

                // Read the size under the same lock as the writers; the map is
                // mutated concurrently by the other workers.
                final int distinctTerms;
                synchronized (counts) {
                    distinctTerms = counts.size();
                }
                System.out.println(domain.domain + "\t" + distinctTerms);
            });
        }

        fjp.shutdown();
        if (!fjp.awaitTermination(10, TimeUnit.DAYS)) {
            System.err.println("Timed out waiting for extraction to finish, output will be incomplete");
        }

        // Buffered so that each writeLong does not become its own write to the file
        try (var dos = new DataOutputStream(new java.io.BufferedOutputStream(Files.newOutputStream(Path.of(outFile))))) {
            synchronized (counts) {
                counts.put(DOC_COUNT_KEY, docCount.get());

                counts.forEachEntry((hash, cnt) -> {
                    try {
                        dos.writeLong(hash);
                        dos.writeLong(cnt);
                    } catch (IOException e) {
                        throw new java.io.UncheckedIOException(e);
                    }
                    return true;
                });
            }
        }

        System.out.println(docCount.get());
    }
}

View File

@ -1,242 +0,0 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id "me.champeau.jmh" version "0.6.6"
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
repositories {
mavenLocal()
maven { url "https://artifactory.cronapp.io/public-release/" }
maven { url "https://repo1.maven.org/maven2/" }
maven { url "https://www2.ph.ed.ac.uk/maven2/" }
maven { url "https://jitpack.io/" }
exclusiveContent {
forRepository {
maven {
url = uri("https://jitpack.io")
}
}
filter {
// Only use JitPack for the `gson-record-type-adapter-factory` library
includeModule("com.github.Marcono1234", "gson-record-type-adapter-factory")
}
}
}
sourceSets {
e2eTest {
java {
java {
compileClasspath += main.output + test.output
runtimeClasspath += main.output + test.output
srcDir file('src/e2e/java')
}
resources.srcDir file('src/e2e/resources')
}
}
jmh {
java {
java {
compileClasspath += main.output + test.output
runtimeClasspath += main.output + test.output
srcDir file('src/jmh/java')
}
resources.srcDir file('src/jmh/resources')
}
}
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
jmhJar {
zip64 true
}
dependencies {
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation 'org.projectlombok:lombok:1.18.24'
implementation 'org.jetbrains:annotations:20.1.0'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
implementation 'com.github.jknack:handlebars:4.3.1'
implementation 'com.github.jknack:handlebars-markdown:4.2.1'
implementation group: 'com.google.code.gson', name: 'gson', version: '2.9.0'
implementation 'io.reactivex.rxjava3:rxjava:3.1.5'
implementation "com.sparkjava:spark-core:2.9.3"
implementation 'com.opencsv:opencsv:5.6'
implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
implementation 'org.slf4j:slf4j-api:1.7.36'
testImplementation 'org.slf4j:slf4j-jdk14:2.0.3'
implementation 'com.google.guava:guava:31.1-jre'
implementation 'com.google.inject:guice:5.1.0'
implementation 'com.github.jnr:jnr-ffi:2.2.12'
implementation 'org.apache.httpcomponents:httpcore:4.4.15'
implementation 'org.apache.httpcomponents:httpclient:4.5.13'
implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
implementation 'org.jsoup:jsoup:1.15.3'
implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.6'
implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'
implementation 'com.zaxxer:HikariCP:5.0.1'
implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
implementation 'io.prometheus:simpleclient:0.16.0'
implementation 'io.prometheus:simpleclient_servlet:0.16.0'
implementation 'io.prometheus:simpleclient_httpserver:0.16.0'
implementation 'io.prometheus:simpleclient_hotspot:0.16.0'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
implementation 'com.github.luben:zstd-jni:1.5.2-2'
implementation 'org.lz4:lz4-java:1.8.0'
implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'
implementation 'org.imgscalr:imgscalr-lib:4.2'
implementation 'org.jclarion:image4j:0.7'
implementation 'commons-net:commons-net:3.8.0'
implementation 'org.eclipse.jgit:org.eclipse.jgit:5.12.0.202106070339-r'
implementation 'org.eclipse.jgit:org.eclipse.jgit.ssh.jsch:5.12.0.202106070339-r'
implementation 'com.jcraft:jsch:0.1.55'
implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
testImplementation 'org.mockito:mockito-junit-jupiter:4.5.1'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
testCompileOnly 'org.projectlombok:lombok:1.18.24'
testImplementation 'org.projectlombok:lombok:1.18.24'
testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.0'
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
e2eTestImplementation 'org.testcontainers:nginx:1.17.4'
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
e2eTestImplementation 'org.testcontainers:selenium:1.17.4'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.5.3'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
implementation 'org.seleniumhq.selenium:selenium-java:4.5.3'
implementation 'org.sejda.imageio:webp-imageio:0.1.6'
jmh 'org.openjdk.jmh:jmh-core:1.35'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
implementation 'net.agkn:hll:1.6.0'
}
configurations {
e2eTestImplementation.extendsFrom(testImplementation)
}
test {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform()
}
task fastTests(type: Test) {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform {
excludeTags "slow"
}
}
task e2eTest(type: Test) {
maxParallelForks = 1
forkEvery = 1
maxHeapSize = "8G"
dependsOn ':shadowJar'
dependsOn 'downloadTestData'
dependsOn 'downloadRDRModelData'
dependsOn 'downloadSentenceModelData'
dependsOn 'downloadTokenModelData'
dependsOn 'downloadTermFreqData'
dependsOn 'IP2LocationFile'
classpath = sourceSets.e2eTest.runtimeClasspath
testClassesDirs = sourceSets.e2eTest.output.classesDirs
useJUnitPlatform {
includeTags "e2e"
}
}
task downloadTestData(type: Download) {
src 'http://hammurabi.acc.umu.se/mirror/kiwix.org/zim/wikipedia/wikipedia_en_100_nopic_2022-05.zim'
dest file('data/test/wikipedia_en_100_nopic.zim')
overwrite false
}
task downloadRDRModelData(type: Download) {
src (['https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT',
'https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR'])
dest file('data/models/')
overwrite false
}
task downloadSentenceModelData(type: Download) {
src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin'
dest file('data/models/opennlp-sentence.bin')
overwrite false
}
task downloadTokenModelData(type: Download) {
src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin'
dest file('data/models/opennlp-tokens.bin')
overwrite false
}
task downloadIP2LocationFile(type: Download) {
src 'https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP'
dest file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP')
overwrite false
}
// Unpacks the downloaded IP2Location archive into data/models/IP2LOC.
task IP2LocationFile(type: Copy) {
// The archive has to be downloaded before it can be extracted.
dependsOn 'downloadIP2LocationFile'
def zipFile = file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP')
def outputDir = file("data/models/IP2LOC")
// zipTree exposes the archive's entries as a file tree, which the Copy task writes to outputDir.
from zipTree(zipFile)
into outputDir
}
// Fetches the term frequency data file into data/models/tfreq-new-algo3.bin.
task downloadTermFreqData(type: Download) {
src 'https://downloads.marginalia.nu/model/tfreq-new-algo3.bin'
dest file('data/models/tfreq-new-algo3.bin')
// presumably keeps an already-downloaded file instead of fetching it again — confirm against the Download plugin
overwrite false
}

View File

@ -1,2 +0,0 @@
# This file is generated by the 'io.freefair.lombok' Gradle plugin
config.stopBubbling = true

View File

@ -1,15 +0,0 @@
package nu.marginalia.memex;
import nu.marginalia.memex.auth.AuthMain;
import nu.marginalia.service.descriptor.ServiceDescriptor;
import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId;
import java.util.List;
/**
 * Holds the service descriptors known to the memex application.
 */
public class MemexServiceDescriptors {

    /**
     * Descriptors for the memex service and the auth service. The integer
     * argument is presumably the port each service listens on (confirm
     * against {@code ServiceDescriptor}).
     */
    public static ServiceDescriptors descriptors = new ServiceDescriptors(
            List.of(
                    new ServiceDescriptor(ServiceId.Other_Memex, 5030),
                    new ServiceDescriptor(ServiceId.Other_Auth, 5003)
            )
    );
}

View File

@ -1,14 +0,0 @@
package nu.marginalia.memex.auth;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import nu.marginalia.service.descriptor.HostsFile;
import java.nio.file.Path;
/**
 * Guice module providing the bindings that are specific to the auth service.
 */
public class AuthConfigurationModule extends AbstractModule {

    /**
     * Binds the {@code password-file} path and a new {@link HostsFile} instance.
     */
    public void configure() {
        final Path passwordFile = Path.of("/var/lib/wmsa/password.dat");
        bind(Path.class)
                .annotatedWith(Names.named("password-file"))
                .toInstance(passwordFile);

        final HostsFile hostsFile = new HostsFile();
        bind(HostsFile.class).toInstance(hostsFile);
    }
}

View File

@ -1,27 +0,0 @@
package nu.marginalia.memex.auth;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.memex.MemexServiceDescriptors;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.id.ServiceId;
import nu.marginalia.service.module.ConfigurationModule;
import nu.marginalia.service.server.Initialization;
/**
 * Entry point of the auth service.
 */
public class AuthMain extends MainClass {

    /**
     * The service is only requested as a constructor argument so that the
     * injector instantiates it; the reference itself is not retained.
     */
    @Inject
    public AuthMain(AuthService service) {
    }

    /**
     * Initialises the process as the auth service, builds the injector,
     * instantiates {@link AuthMain} and then marks initialization as ready.
     */
    public static void main(String... args) {
        MainClass.init(ServiceId.Other_Auth, args);

        final AuthConfigurationModule authModule = new AuthConfigurationModule();
        final ConfigurationModule configurationModule =
                new ConfigurationModule(MemexServiceDescriptors.descriptors, ServiceId.Other_Auth);

        final Injector injector = Guice.createInjector(authModule, configurationModule);

        injector.getInstance(AuthMain.class);
        injector.getInstance(Initialization.class).setReady();
    }
}

View File

@ -1,118 +0,0 @@
package nu.marginalia.memex.auth;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.client.Context;
import nu.marginalia.memex.auth.model.LoginFormModel;
import nu.marginalia.memex.renderer.MustacheRenderer;
import nu.marginalia.memex.renderer.RendererFactory;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.service.server.MetricsServer;
import nu.marginalia.service.server.RateLimiter;
import nu.marginalia.service.server.Service;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.Optional;
import java.util.UUID;
import static spark.Spark.*;
/**
 * Single-password login service.
 *
 * <p>Registers {@code POST public/api/login} and {@code GET public/api/login}
 * (the login form), plus {@code GET api/is-logged-in}. A successful login is
 * recorded as the {@code logged-in} attribute of the HTTP session.
 */
public class AuthService extends Service {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** The one accepted password; read from file or randomly generated at start-up. */
    private final String password;
    private final RateLimiter rateLimiter = RateLimiter.forLogin();
    private final MustacheRenderer<LoginFormModel> loginFormRenderer;

    @Inject
    public AuthService(@Named("service-host") String ip,
                       @Named("service-port") Integer port,
                       @Named("password-file") Path topSecretPasswordFile,
                       RendererFactory rendererFactory,
                       Initialization initialization,
                       MetricsServer metricsServer) throws IOException {
        super(ip, port, initialization, metricsServer);

        password = initPassword(topSecretPasswordFile);
        loginFormRenderer = rendererFactory.renderer("auth/login");

        Spark.path("public/api", () -> {
            // Log every request made under public/api
            before((req, rsp) -> {
                logger.info("{} {}", req.requestMethod(), req.pathInfo());
            });
            post("/login", this::login);
            get("/login", this::loginForm);
        });
        Spark.path("api", () -> {
            get("/is-logged-in", this::isLoggedIn);
        });
    }

    /**
     * Reads the password from the given file. When the file is missing or
     * unreadable a random UUID is used instead, which effectively disables
     * login until a password file is provided.
     */
    private String initPassword(Path topSecretPasswordFile) {
        if (Files.exists(topSecretPasswordFile)) {
            try {
                // NOTE(review): the content is used verbatim; a trailing newline
                // in the file would become part of the password — confirm the
                // file is written without one.
                return Files.readString(topSecretPasswordFile);
            } catch (IOException e) {
                logger.error("Could not read password from file " + topSecretPasswordFile, e);
            }
        }
        logger.error("Setting random password");
        return UUID.randomUUID().toString();
    }

    /**
     * Renders the login form. Both the {@code redirect} and {@code service}
     * query parameters are mandatory; a missing one raises a
     * {@link NullPointerException}.
     */
    private Object loginForm(Request request, Response response) {
        String redir = Objects.requireNonNull(request.queryParams("redirect"));
        String service = Objects.requireNonNull(request.queryParams("service"));

        return loginFormRenderer.render(new LoginFormModel(service, redir));
    }

    /**
     * Handles a login attempt: redirects when already logged in or when the
     * password matches, halts with 429 when rate limited, and answers 403 on
     * a wrong password.
     */
    private Object login(Request request, Response response) {
        // NOTE(review): the redirect target comes straight from the request and
        // is not validated, so this endpoint can redirect to arbitrary URLs.
        var redir = Objects.requireNonNullElse(request.queryParams("redirect"), "/");

        if (isLoggedIn(request, response)) {
            response.redirect(redir);
            return "";
        }

        if (!rateLimiter.isAllowed(Context.fromRequest(request))) {
            Spark.halt(429, "Too many requests");
            return null;
        }

        if (passwordMatches(request.queryParams("password"))) {
            request.session(true).attribute("logged-in", true);
            response.redirect(redir);
            return "";
        }

        response.status(HttpStatus.SC_FORBIDDEN);
        return "<h1>Bad password!</h1>";
    }

    /**
     * Compares the supplied password against the configured one in constant
     * time, so the response time does not reveal how many leading characters
     * were correct.
     *
     * @param candidate the password from the request, may be {@code null}
     * @return {@code true} only when the candidate equals the configured password
     */
    private boolean passwordMatches(String candidate) {
        if (candidate == null) {
            return false;
        }
        return java.security.MessageDigest.isEqual(
                password.getBytes(java.nio.charset.StandardCharsets.UTF_8),
                candidate.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Reports whether the request carries a session marked as logged in.
     * Never creates a session as a side effect.
     */
    public boolean isLoggedIn(Request request, Response response) {
        var session = request.session(false);

        if (null == session) {
            return false;
        }

        return Optional.ofNullable(session.attribute("logged-in"))
                .map(Boolean.class::cast)
                .orElse(false);
    }
}

View File

@ -1,45 +0,0 @@
package nu.marginalia.memex.auth.client;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.WmsaHome;
import nu.marginalia.client.AbstractDynamicClient;
import nu.marginalia.client.Context;
import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId;
import org.apache.http.HttpStatus;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
/**
 * Client for the auth service, with helpers for guarding Spark routes.
 */
public class AuthClient extends AbstractDynamicClient {

    @Inject
    public AuthClient(ServiceDescriptors descriptors) {
        super(descriptors.forId(ServiceId.Other_Auth), WmsaHome.getHostsFile(), new GsonBuilder()::create);
    }

    /** Asks the auth service whether the session belonging to the context is logged in. */
    public Observable<Boolean> isLoggedIn(Context ctx) {
        return get(ctx, "/api/is-logged-in").map(Boolean::parseBoolean);
    }

    /**
     * Redirects the caller to the login page and halts the request when the
     * session is not logged in. The redirect is built from the
     * {@code X-Extern-Domain} and {@code X-Extern-Url} request headers.
     */
    public void redirectToLoginIfUnauthenticated(String domain, Request req, Response rsp) {
        if (isLoggedInBlocking(Context.fromRequest(req))) {
            return;
        }

        final String loginUrl = req.headers("X-Extern-Domain")
                + "/auth/login?service=" + domain
                + "&redirect=" + URLEncoder.encode(req.headers("X-Extern-Url"), StandardCharsets.UTF_8);

        rsp.redirect(loginUrl);
        Spark.halt();
    }

    /** Halts the request with 403 Forbidden unless the session is logged in. */
    public void requireLogIn(Context ctx) {
        if (isLoggedInBlocking(ctx)) {
            return;
        }
        Spark.halt(HttpStatus.SC_FORBIDDEN);
    }

    /** Blocks for the login status, giving the auth service at most one second to emit it. */
    private boolean isLoggedInBlocking(Context ctx) {
        return isLoggedIn(ctx)
                .timeout(1, TimeUnit.SECONDS)
                .blockingFirst();
    }
}

View File

@ -1,10 +0,0 @@
package nu.marginalia.memex.auth.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * Template model for the login form: the service being logged in to and the
 * location to return to afterwards.
 */
@Getter
@AllArgsConstructor
public class LoginFormModel {
    /** Value of the {@code service} query parameter. */
    public final String service;

    /** Value of the {@code redirect} query parameter. */
    public final String redirect;
}

View File

@ -1,43 +0,0 @@
package nu.marginalia.memex.gemini;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetAddress;
import java.util.HashSet;
import java.util.Set;
/**
 * Process-wide list of client addresses that have been banned for sending
 * requests that are clearly not Gemini requests.
 *
 * <p>The single {@link #INSTANCE} is shared between the thread that accepts
 * connections and the worker threads serving them, so the underlying set is
 * a concurrent one.
 */
public class BadBotList {
    // A concurrent set: addresses are added and looked up from different threads.
    private final Set<InetAddress> shitlist = java.util.concurrent.ConcurrentHashMap.newKeySet();

    public static final BadBotList INSTANCE = new BadBotList();

    private final Logger logger = LoggerFactory.getLogger(getClass().getSimpleName());

    private BadBotList() {}

    /**
     * @return {@code true} unless the address has previously been banned
     */
    public boolean isAllowed(InetAddress address) {
        return !shitlist.contains(address);
    }

    /**
     * Checks a request line, banning the sender when it looks like an HTTP or
     * RDP probe.
     *
     * @return {@code false} when the query is rejected and the address was banned
     */
    public boolean isQueryPermitted(InetAddress address, String query) {
        if (isBadQuery(query)) {
            logger.info("Banning {}", address);
            shitlist.add(address);
            return false;
        }
        return true;
    }

    /**
     * A query is bad when it starts with an HTTP verb (GET, OPTIONS) or
     * contains "mstshash" (presumably the cookie sent by RDP scanners).
     */
    private boolean isBadQuery(String query) {
        return query.startsWith("GET")
                || query.startsWith("OPTIONS")
                || query.contains("mstshash");
    }
}

View File

@ -1,17 +0,0 @@
package nu.marginalia.memex.gemini;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import java.nio.file.Path;
/**
 * Guice module with the file locations and port used by the Gemini server.
 */
public class GeminiConfigurationModule extends AbstractModule {

    /**
     * Binds the server root, certificate store, certificate password file and
     * the listening port under their respective names.
     */
    public void configure() {
        bindPath("gemini-server-root", "/var/lib/wmsa/memex-gmi");
        bindPath("gemini-cert-file", "/var/lib/wmsa/gemini/crypto.jks");
        bindPath("gemini-cert-password-file", "/var/lib/wmsa/gemini/password.dat");

        bind(Integer.class)
                .annotatedWith(Names.named("gemini-server-port"))
                .toInstance(1965);
    }

    /** Binds a {@link Path} for the given location under the given name. */
    private void bindPath(String name, String location) {
        bind(Path.class)
                .annotatedWith(Names.named(name))
                .toInstance(Path.of(location));
    }
}

View File

@ -1,7 +0,0 @@
package nu.marginalia.memex.gemini;
/**
 * A Gemini server that can be started with {@link #run()}.
 */
public interface GeminiService {

    /** Name of the default gemtext document, {@code index.gmi}. */
    String DEFAULT_FILENAME = "index.gmi";

    /** Runs the service. */
    void run();
}

View File

@ -1,10 +0,0 @@
package nu.marginalia.memex.gemini;
import com.google.inject.Singleton;
/**
 * {@link GeminiService} implementation that does nothing when run.
 */
@Singleton
public class GeminiServiceDummy implements GeminiService {

    /** Intentionally a no-op. */
    @Override
    public void run() {
        // nothing to do
    }
}

View File

@ -1,164 +0,0 @@
package nu.marginalia.memex.gemini;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.memex.gemini.io.GeminiConnection;
import nu.marginalia.memex.gemini.io.GeminiSSLSetUp;
import nu.marginalia.memex.gemini.io.GeminiStatusCode;
import nu.marginalia.memex.gemini.io.GeminiUserException;
import nu.marginalia.memex.gemini.plugins.BareStaticPagePlugin;
import nu.marginalia.memex.gemini.plugins.Plugin;
import nu.marginalia.memex.gemini.plugins.SearchPlugin;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLServerSocket;
import javax.net.ssl.SSLServerSocketFactory;
import javax.net.ssl.SSLSocket;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
/**
 * Gemini server: accepts TLS connections, reads one request URL per
 * connection and offers it to each plugin in turn until one serves it.
 *
 * <p>Connections are served on a fixed pool of 32 threads; addresses on the
 * {@link BadBotList} are disconnected immediately.
 */
@Singleton
public class GeminiServiceImpl implements GeminiService {

    public final Path serverRoot;

    private final Logger logger = LoggerFactory.getLogger(getClass().getSimpleName());
    private final Executor pool = Executors.newFixedThreadPool(32);
    private final SSLServerSocket serverSocket;

    /** Request handlers, tried in array order. */
    private final Plugin[] plugins;
    private final BadBotList badBotList = BadBotList.INSTANCE;

    /**
     * Opens the TLS server socket and verifies that the server root exists.
     * The process exits with status 255 when the server root is missing.
     */
    @Inject
    public GeminiServiceImpl(@Named("gemini-server-root") Path serverRoot,
                             @Named("gemini-server-port") Integer port,
                             GeminiSSLSetUp sslSetUp,
                             BareStaticPagePlugin pagePlugin,
                             SearchPlugin searchPlugin) throws Exception {
        this.serverRoot = serverRoot;

        logger.info("Setting up crypto");
        final SSLServerSocketFactory socketFactory = sslSetUp.getServerSocketFactory();

        serverSocket = (SSLServerSocket) socketFactory.createServerSocket(port /* 1965 */);
        // NOTE(review): this enables every cipher suite the factory supports,
        // not just the ones enabled by default.
        serverSocket.setEnabledCipherSuites(socketFactory.getSupportedCipherSuites());
        serverSocket.setEnabledProtocols(new String[] {"TLSv1.3", "TLSv1.2"});

        logger.info("Verifying setup");
        if (!Files.exists(this.serverRoot)) {
            logger.error("Could not find SERVER_ROOT {}", this.serverRoot);
            System.exit(255);
        }

        plugins = new Plugin[] {
                pagePlugin,
                searchPlugin
        };
    }

    /**
     * Accept loop. Runs until accepting a connection fails; a failure while
     * setting up one individual connection only drops that connection.
     */
    @Override
    public void run() {
        logger.info("Awaiting connections");
        try {
            for (;;) {
                SSLSocket connection = (SSLSocket) serverSocket.accept();
                try {
                    connection.setSoTimeout(10_000);

                    if (!badBotList.isAllowed(connection.getInetAddress())) {
                        connection.close();
                    } else {
                        pool.execute(() -> serve(connection));
                    }
                }
                catch (IOException ex) {
                    // The socket has not been handed to the pool at this point,
                    // so it is safe (and necessary) to dispose of it here.
                    logger.warn("Failed to set up connection", ex);
                    try {
                        connection.close();
                    }
                    catch (IOException ignored) {
                        // nothing more can be done for this socket
                    }
                }
            }
        }
        catch (IOException ex) {
            logger.error("IO Exception in gemini server", ex);
        }
    }

    /** Serves a single connection and always closes it afterwards. */
    private void serve(SSLSocket socket) {
        final GeminiConnection connection;
        try {
            connection = new GeminiConnection(socket);
        }
        catch (IOException ex) {
            logger.error("Failed to create connection object", ex);
            return;
        }

        try {
            handleRequest(connection);
        }
        catch (GeminiUserException ex) {
            errorResponse(connection, ex.getMessage());
        }
        catch (SSLException ex) {
            // No response is attempted on a broken TLS session; the finally
            // block takes care of closing the connection.
            logger.error(connection.getAddress() + " SSL error");
        }
        catch (Exception ex) {
            errorResponse(connection, "Error");
            logger.error(connection.getAddress(), ex);
        }
        finally {
            connection.close();
        }
    }

    /** Sends a permanent-error status line with the message, if still connected. */
    private void errorResponse(GeminiConnection connection, String message) {
        if (connection.isConnected()) {
            try {
                logger.error("=> {} {}", connection.getAddress(), message);
                connection.writeStatusLine(GeminiStatusCode.ERROR_PERMANENT, message);
            }
            catch (IOException ex) {
                logger.error("Exception while sending error", ex);
            }
        }
    }

    /**
     * Reads the request URL and serves the page.
     *
     * @throws GeminiUserException when the URL's scheme is not {@code gemini}
     */
    private void handleRequest(GeminiConnection connection) throws Exception {
        final String address = connection.getAddress();
        logger.info("Connect: " + address);

        final Optional<URI> maybeUri = connection.readUrl();
        if (maybeUri.isEmpty()) {
            logger.info("Done: {}", address);
            return;
        }

        final URI uri = maybeUri.get();
        logger.info("Request {}", uri);

        // getScheme() is null for a scheme-less URI, hence the constant-first comparison
        if (!"gemini".equals(uri.getScheme())) {
            throw new GeminiUserException("Unsupported protocol");
        }

        servePage(connection, uri);
        logger.info("Done: {}", address);
    }

    /** Lets the first willing plugin serve the URL; otherwise reports a temporary error. */
    private void servePage(GeminiConnection connection, URI url) throws IOException {
        String path = url.getPath();

        for (Plugin p : plugins) {
            if (p.serve(url, connection)) {
                return;
            }
        }

        logger.error("FileNotFound {}", path);
        connection.writeStatusLine(GeminiStatusCode.ERROR_TEMPORARY, "No such file");
    }
}

View File

@ -1,130 +0,0 @@
package nu.marginalia.memex.gemini.client;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.security.cert.X509Certificate;
/**
 * Minimal Gemini protocol client. Unstable code!
 *
 * <p>Sends a single request line and reads the status line plus the complete
 * response body into memory.
 */
public class GeminiClient {

    private final SSLSocketFactory socketFactory;

    // Create a trust manager that does not validate anything.
    // NOTE(review): with this trust manager the server's identity is never
    // verified, so connections are open to man-in-the-middle attacks.
    public static final TrustManager[] trustAllCerts = new TrustManager[]{
            new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain,
                                               String authType) {
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain,
                                               String authType) {
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            }
    };

    /** Builds a socket factory backed by {@link #trustAllCerts}. */
    public static SSLSocketFactory buildSocketFactory() throws Exception {
        // Install the all-trusting trust manager
        final SSLContext sslContext = SSLContext.getInstance("SSL");
        sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
        return sslContext.getSocketFactory();
    }

    public GeminiClient() throws Exception {
        socketFactory = buildSocketFactory();
    }

    /**
     * Requests the given URI from its host, using port 1965 when the URI does
     * not name one.
     *
     * @param uri the URI to request; it is sent verbatim as the request line
     * @return the status code, the meta string (empty when the status line has
     *         none) and the raw response body
     * @throws IOException on connection or read failures
     */
    public Response get(URI uri) throws IOException {
        final int port = uri.getPort() == -1 ? 1965 : uri.getPort();
        final String host = uri.getHost();

        var requestString = String.format("%s\r\n", uri).getBytes(StandardCharsets.UTF_8);

        try (var socket = socketFactory.createSocket(host, port)) {
            socket.setSoTimeout(10_000);
            socket.getOutputStream().write(requestString);

            var is = socket.getInputStream();
            String statusLine = new GeminiInput(is).get();

            int code = Integer.parseInt(statusLine.substring(0,2));
            // A status line consisting of only the two-digit code has no meta part
            String meta = statusLine.length() > 3 ? statusLine.substring(3) : "";

            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            is.transferTo(baos);

            return new Response(code, meta, baos.toByteArray());
        }
    }

    /** A complete Gemini response. */
    public static class Response {
        public final int code;
        public final String meta;
        public final byte[] data;

        Response(int code, String meta, byte[] data) {
            this.code = code;
            this.meta = meta;
            this.data = data;
        }
    }

    /**
     * Reads a single CRLF-terminated line of at most 1024 bytes (terminator
     * included) from a stream, one byte at a time so that nothing beyond the
     * line is consumed.
     */
    public static class GeminiInput {
        private final InputStream is;
        private final byte[] buffer = new byte[1024];

        /** Number of bytes read into {@link #buffer} so far. */
        private int idx;

        final String result;

        /**
         * Reads the line eagerly.
         *
         * @throws IOException on read failures
         * @throws RuntimeException when the stream ends before CR LF, or when
         *         no CR LF is found within the first 1024 bytes
         */
        public GeminiInput(InputStream is) throws IOException {
            this.is = is;

            // The terminator is checked after every byte, including the one
            // that fills the buffer, so a line of exactly 1024 bytes is accepted.
            for (idx = 0; !hasEndOfLine(); idx++) {
                if (idx >= buffer.length) {
                    throw new RuntimeException("String too long");
                }
                readCharacter();
            }

            result = new String(buffer, 0, idx - 2, StandardCharsets.UTF_8);
        }

        /** @return the line that was read, without its CR LF terminator */
        public String get() {
            return result;
        }

        private void readCharacter() throws IOException {
            int rb = is.read();
            if (-1 == rb) {
                throw new RuntimeException("URL incomplete (no CR LF)");
            }
            buffer[idx] = (byte) rb;
        }

        /**
         * @return {@code true} when the last two bytes read are CR LF; this
         *         includes the case where the line consists of CR LF only
         */
        public boolean hasEndOfLine() {
            return idx >= 2
                    && buffer[idx - 1] == (byte) '\n'
                    && buffer[idx - 2] == (byte) '\r';
        }
    }
}

View File

@ -1,53 +0,0 @@
package nu.marginalia.memex.gemini.gmi;
import lombok.Getter;
import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine;
import nu.marginalia.memex.gemini.gmi.parser.GemtextParser;
import nu.marginalia.memex.gemini.gmi.renderer.GemtextRenderer;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import java.io.IOException;
import java.io.Writer;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * A parsed gemtext document: the URL it belongs to and its lines.
 */
@Getter
public class Gemtext {
    private final AbstractGemtextLine[] lines;
    private final MemexNodeUrl url;

    /**
     * Parses the raw lines, using {@code headingRoot} as the root heading id.
     */
    public Gemtext(MemexNodeUrl url, String[] lines, MemexNodeHeadingId headingRoot) {
        this.lines = GemtextParser.parse(lines, headingRoot);
        this.url = url;
    }

    /** Parses the raw lines with heading id 0 as the root. */
    public Gemtext(MemexNodeUrl url, String[] lines) {
        this(url, lines, new MemexNodeHeadingId(0));
    }

    /** Renders every line and concatenates the results without any separator. */
    public String render(GemtextRenderer renderer) {
        final StringBuilder rendered = new StringBuilder();
        for (AbstractGemtextLine line : lines) {
            rendered.append(renderer.renderLine(line));
        }
        return rendered.toString();
    }

    /** Renders every line to the writer, each followed by a newline character. */
    public void render(GemtextRenderer renderer, Writer w) throws IOException {
        for (int i = 0; i < lines.length; i++) {
            w.write(renderer.renderLine(lines[i]));
            w.write('\n');
        }
    }

    /** @return the lines of the document, in order */
    public Stream<AbstractGemtextLine> stream() {
        return Stream.of(lines);
    }

    /** @return the line at the given zero-based index */
    public AbstractGemtextLine get(int idx) {
        return lines[idx];
    }

    /** @return the number of lines */
    public int size() {
        return lines.length;
    }
}

View File

@ -1,71 +0,0 @@
package nu.marginalia.memex.gemini.gmi;
import com.google.common.collect.Sets;
import nu.marginalia.memex.gemini.gmi.line.GemtextLineVisitorAdapter;
import nu.marginalia.memex.gemini.gmi.line.GemtextLink;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.model.MemexUrl;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
 * A gemtext document used as a key-value store: every link line is an entry
 * keyed by the link's URL, with the link title as its data.
 */
public class GemtextDatabase extends Gemtext {

    /** Maps the string form of each link URL to the index of the line holding it. */
    public final Map<String, Integer> links;

    public GemtextDatabase(MemexNodeUrl url, String[] lines) {
        super(url, lines);

        links = new HashMap<>();

        final int lineCount = size();
        for (int lineNo = 0; lineNo < lineCount; lineNo++) {
            final int position = lineNo;

            // Only link lines are of interest; the adapter ignores everything else
            get(lineNo).visit(new GemtextLineVisitorAdapter<>() {
                @Override
                public Object visit(GemtextLink g) {
                    links.put(g.getUrl().toString(), position);
                    return null;
                }
            });
        }
    }

    /** @return the URLs of all links in the database */
    public Set<String> keys() {
        return links.keySet();
    }

    /**
     * Looks up the title stored for a URL.
     *
     * @return empty when the URL is not in the database; otherwise the link's
     *         title, or the empty string when the link has no title
     */
    public Optional<String> getLinkData(MemexUrl url) {
        final Integer position = links.get(url.getUrl());

        if (position == null) {
            return Optional.empty();
        }

        final String title = get(position).mapLink(GemtextLink::getTitle).orElse("");
        return Optional.of(title);
    }

    public static GemtextDatabase of(MemexNodeUrl url, String[] lines) {
        return new GemtextDatabase(url, lines);
    }

    /** Reads all lines of the file and builds a database from them. */
    public static GemtextDatabase of(MemexNodeUrl url, Path file) throws IOException {
        try (var lineStream = Files.lines(file)) {
            final String[] allLines = lineStream.toArray(String[]::new);
            return new GemtextDatabase(url, allLines);
        }
    }

    /**
     * Compares this database to another one.
     *
     * @return the URLs that exist only in this database, together with the
     *         URLs present in both whose link data differs
     */
    public Set<MemexNodeUrl> difference(GemtextDatabase other) {
        final Set<MemexNodeUrl> differences = new HashSet<>();

        for (String key : keys()) {
            final MemexNodeUrl nodeUrl = new MemexNodeUrl(key);

            final boolean missingFromOther = !other.keys().contains(key);
            if (missingFromOther
                    || !Objects.equals(getLinkData(nodeUrl), other.getLinkData(nodeUrl))) {
                differences.add(nodeUrl);
            }
        }

        return differences;
    }
}

View File

@ -1,163 +0,0 @@
package nu.marginalia.memex.gemini.gmi;
import lombok.Getter;
import nu.marginalia.memex.gemini.gmi.renderer.GemtextRenderer;
import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory;
import nu.marginalia.memex.gemini.gmi.line.*;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeTaskId;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.model.MemexTaskState;
import org.apache.commons.lang3.tuple.Pair;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * A gemtext document together with the data extracted from its lines:
 * headings, links, tasks, pragmas, title and date.
 */
@Getter
public class GemtextDocument extends Gemtext {
    private final Map<MemexNodeHeadingId, String> headings;
    private final Map<String, List<MemexNodeHeadingId>> headingsByName;
    private final Set<String> pragmas;
    private final List<GemtextTask> tasks;
    private final String title;
    private final String date;
    private final List<GemtextLink> links;

    /** Hash of the raw lines the document was built from. */
    private final int hashCode;

    // The leading ".*" is greedy, so group 1 is the last yyyy-MM-dd occurrence in the input
    private static final Pattern datePattern = Pattern.compile(".*(\\d{4}-\\d{2}-\\d{2}).*");

    private static final GemtextRenderer rawRenderer = new GemtextRendererFactory().gemtextRendererAsIs();

    /**
     * Parses the lines and extracts the document data. The title is the name
     * of the first heading, or the document URL when there is no heading.
     */
    public GemtextDocument(MemexNodeUrl url, String[] lines, MemexNodeHeadingId headingRoot) {
        super(url, lines, headingRoot);

        this.hashCode = Arrays.hashCode(lines);

        final GemtextDataExtractor extractor = new GemtextDataExtractor();
        for (AbstractGemtextLine line : this.getLines()) {
            extractor.take(line);
        }

        this.headings = extractor.getHeadings();
        this.links = extractor.getLinks();
        this.title = Objects.requireNonNullElse(extractor.getTitle(), url.getUrl());
        this.pragmas = extractor.getPragmas();
        this.headingsByName = extractor.getHeadingsByName();
        this.tasks = extractor.getTasks();
        this.date = extractor.getDate();
    }

    /** @return the name of the heading the line belongs to, or "" when unknown */
    public String getHeadingForElement(AbstractGemtextLine line) {
        return headings.getOrDefault(line.getHeading(), "");
    }

    /** @return all lines whose heading is a child of the given heading */
    public List<AbstractGemtextLine> getSection(MemexNodeHeadingId headingId) {
        return stream()
                .filter(line -> line.getHeading().isChildOf(headingId))
                .collect(Collectors.toList());
    }

    /**
     * Renders a section as raw gemtext, one line per document line. Heading
     * id 0 selects the whole document without any filtering.
     */
    public String getSectionGemtext(MemexNodeHeadingId headingId) {
        var selected = stream();

        final boolean wholeDocument = headingId.equals(new MemexNodeHeadingId(0));
        if (!wholeDocument) {
            selected = selected.filter(line -> line.getHeading().isChildOf(headingId));
        }

        return selected
                .map(rawRenderer::renderLine)
                .collect(Collectors.joining("\n"));
    }

    /**
     * @return the level-1 tasks that are in state TODO or URGENT, keyed by
     *         task id, each with its text and state
     */
    public Map<MemexNodeTaskId, Pair<String, MemexTaskState>> getOpenTopTasks() {
        return tasks.stream()
                .filter(GemtextDocument::isOpen)
                .filter(task -> task.getId().level() == 1)
                .collect(Collectors.toMap(
                        GemtextTask::getId,
                        task -> Pair.of(task.getTask(), task.getState())));
    }

    /** A task is open while its state is TODO or URGENT. */
    private static boolean isOpen(GemtextTask task) {
        return MemexTaskState.TODO.equals(task.getState())
                || MemexTaskState.URGENT.equals(task.getState());
    }

    public static GemtextDocument of(MemexNodeUrl url, String... lines) {
        return new GemtextDocument(url, lines, new MemexNodeHeadingId(0));
    }

    /** Reads all lines of the file and builds a document from them. */
    public static GemtextDocument of(MemexNodeUrl url, Path file) throws IOException {
        try (var lineStream = Files.lines(file)) {
            final String[] allLines = lineStream.toArray(String[]::new);
            return new GemtextDocument(url, allLines, new MemexNodeHeadingId(0));
        }
    }

    /** @return whether the document's file name is {@code index.gmi} */
    public boolean isIndex() {
        return getUrl().getFilename().equals("index.gmi");
    }

    @Override
    public int hashCode() {
        return hashCode;
    }

    /** @return the name of the heading with the given id, if there is one */
    public Optional<String> getHeading(MemexNodeHeadingId heading) {
        return Optional.ofNullable(headings.get(heading));
    }

    /** @return any heading with the given name that is a child of {@code parent} */
    public Optional<MemexNodeHeadingId> getHeadingByName(MemexNodeHeadingId parent, String name) {
        final List<MemexNodeHeadingId> candidates = headingsByName.get(name);

        if (candidates != null) {
            return candidates.stream()
                    .filter(candidate -> candidate.isChildOf(parent))
                    .findAny();
        }

        return Optional.empty();
    }

    /**
     * Visitor that collects headings, links, tasks and pragmas from the lines
     * it is given, and derives the title and date from the first heading.
     */
    @Getter
    private static class GemtextDataExtractor extends GemtextLineVisitorAdapter<Object> {
        private String title;
        private String date;

        // Ordered by comparing the heading ids' id arrays
        private final Map<MemexNodeHeadingId, String> headings = new TreeMap<>((a, b) -> Arrays.compare(a.getIds(), b.getIds()));
        private final Map<String, List<MemexNodeHeadingId>> headingsByName = new HashMap<>();
        private final Set<String> pragmas = new HashSet<>();
        private final List<GemtextLink> links = new ArrayList<>();
        private final List<GemtextTask> tasks = new ArrayList<>();

        @Override
        public Object visit(GemtextHeading g) {
            final String name = g.getName();

            headings.put(g.getLevel(), name);
            headingsByName.computeIfAbsent(name, key -> new ArrayList<>()).add(g.getLevel());

            // Only the first heading contributes the title and the date
            if (title != null) {
                return null;
            }

            title = name;
            final var dateMatcher = datePattern.matcher(title);
            if (dateMatcher.matches()) {
                date = dateMatcher.group(1);
            }

            return null;
        }

        @Override
        public Object visit(GemtextLink g) {
            links.add(g);
            return null;
        }

        @Override
        public Object visit(GemtextTask g) {
            tasks.add(g);
            return null;
        }

        @Override
        public Object visit(GemtextPragma g) {
            pragmas.add(g.getLine());
            return null;
        }
    }
}

View File

@ -1,18 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.Optional;
import java.util.function.Function;
/**
 * Base class of all parsed gemtext lines.
 *
 * <p>The {@code map*} methods return an empty optional here; the line type
 * they are named after overrides them to apply the mapper to itself.
 */
public abstract class AbstractGemtextLine {

    /** Applies the mapper if this line is a link; empty by default. */
    public <T> Optional<T> mapLink(Function<GemtextLink, T> mapper) {
        return Optional.empty();
    }

    /** Applies the mapper if this line is a heading; empty by default. */
    public <T> Optional<T> mapHeading(Function<GemtextHeading, T> mapper) {
        return Optional.empty();
    }

    /** Applies the mapper if this line is a task; empty by default. */
    public <T> Optional<T> mapTask(Function<GemtextTask, T> mapper) {
        return Optional.empty();
    }

    /** Dispatches to the visitor method matching the concrete line type. */
    public abstract <T> T visit(GemtextLineVisitor<T> visitor);

    /** Whether this line breaks a task (semantics defined by the parser — see its usage). */
    public abstract boolean breaksTask();

    /** @return the id of the heading this line is associated with */
    public abstract MemexNodeHeadingId getHeading();
}

View File

@ -1,21 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
/**
 * An aside line: its text and the heading it is associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextAside extends AbstractGemtextLine {
    private final String line;
    private final MemexNodeHeadingId heading;

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Asides never break a task. */
    public boolean breaksTask() {
        return false;
    }
}

View File

@ -1,32 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.Optional;
import java.util.function.Function;
/**
 * A heading line: its own heading id ({@code level}), its name, and the
 * heading id reported through {@code getHeading()}.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextHeading extends AbstractGemtextLine {
    private final MemexNodeHeadingId level;
    private final String name;
    private final MemexNodeHeadingId heading;

    /** Applies the mapper to this heading; the mapper must not return null. */
    public <T> Optional<T> mapHeading(Function<GemtextHeading, T> mapper) {
        final T mapped = mapper.apply(this);
        return Optional.of(mapped);
    }

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Headings always break a task. */
    public boolean breaksTask() {
        return true;
    }
}

View File

@ -1,18 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
/**
 * Visitor over the concrete gemtext line types.
 *
 * @param <T> the result type of a visit
 */
public interface GemtextLineVisitor<T> {

    /** Visits the line by dispatching on its concrete type. */
    default T take(AbstractGemtextLine line) {
        return line.visit(this);
    }

    T visit(GemtextHeading g);

    T visit(GemtextLink g);

    T visit(GemtextList g);

    T visit(GemtextPreformat g);

    T visit(GemtextQuote g);

    T visit(GemtextText g);

    T visit(GemtextTextLiteral g);

    T visit(GemtextAside g);

    T visit(GemtextTask g);

    T visit(GemtextPragma g);
}

View File

@ -1,53 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
/**
 * {@link GemtextLineVisitor} whose every method returns {@code null}, so
 * that subclasses only need to override the line types they care about.
 *
 * @param <T> the result type of a visit
 */
public class GemtextLineVisitorAdapter<T> implements GemtextLineVisitor<T> {

    @Override
    public T visit(GemtextHeading g) { return null; }

    @Override
    public T visit(GemtextLink g) { return null; }

    @Override
    public T visit(GemtextList g) { return null; }

    @Override
    public T visit(GemtextPreformat g) { return null; }

    @Override
    public T visit(GemtextQuote g) { return null; }

    @Override
    public T visit(GemtextText g) { return null; }

    @Override
    public T visit(GemtextTextLiteral g) { return null; }

    @Override
    public T visit(GemtextAside g) { return null; }

    @Override
    public T visit(GemtextTask g) { return null; }

    @Override
    public T visit(GemtextPragma g) { return null; }
}

View File

@ -1,33 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexUrl;
import javax.annotation.Nullable;
import java.util.Optional;
import java.util.function.Function;
/**
 * A link line: the target URL, an optional title and the heading it is
 * associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextLink extends AbstractGemtextLine {
    private final MemexUrl url;

    @Nullable
    private final String title;

    private final MemexNodeHeadingId heading;

    /**
     * Applies the mapper to this link. A {@code null} result, e.g. from
     * mapping to the absent title, yields an empty optional.
     */
    public <T> Optional<T> mapLink(Function<GemtextLink, T> mapper) {
        final T mapped = mapper.apply(this);
        return Optional.ofNullable(mapped);
    }

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Links never break a task. */
    public boolean breaksTask() {
        return false;
    }
}

View File

@ -1,23 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.List;
/**
 * A list: its items and the heading it is associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextList extends AbstractGemtextLine {
    private final List<String> items;
    private final MemexNodeHeadingId heading;

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Lists always break a task. */
    public boolean breaksTask() {
        return true;
    }
}

View File

@ -1,21 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
/**
 * A pragma line: its text and the heading it is associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextPragma extends AbstractGemtextLine {
    private final String line;
    private final MemexNodeHeadingId heading;

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Pragmas never break a task. */
    public boolean breaksTask() {
        return false;
    }
}

View File

@ -1,23 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.List;
/**
 * A preformatted block: its lines and the heading it is associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextPreformat extends AbstractGemtextLine {
    private final List<String> items;
    private final MemexNodeHeadingId heading;

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Preformatted blocks always break a task. */
    public boolean breaksTask() {
        return true;
    }
}

View File

@ -1,23 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.List;
/**
 * A quote: its lines and the heading it is associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextQuote extends AbstractGemtextLine {
    private final List<String> items;
    private final MemexNodeHeadingId heading;

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Quotes always break a task. */
    public boolean breaksTask() {
        return true;
    }
}

View File

@ -1,42 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeTaskId;
import nu.marginalia.memex.memex.model.MemexTaskState;
import nu.marginalia.memex.memex.model.MemexTaskTags;
import java.util.Optional;
import java.util.function.Function;
/**
 * A task line: its id, text, tags and the heading it is associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextTask extends AbstractGemtextLine {
    private final MemexNodeTaskId id;
    private final String task;
    private final MemexNodeHeadingId heading;
    private final MemexTaskTags tags;

    /** @return the task state derived from the task's tags */
    public MemexTaskState getState() {
        return MemexTaskState.of(tags);
    }

    /** @return the level of the task's id */
    public int getLevel() {
        return id.level();
    }

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Tasks always break a (preceding) task. */
    @Override
    public boolean breaksTask() {
        return true;
    }

    /** Applies the mapper to this task; the mapper must not return null. */
    @Override
    public <T> Optional<T> mapTask(Function<GemtextTask, T> mapper) {
        final T mapped = mapper.apply(this);
        return Optional.of(mapped);
    }
}

View File

@ -1,21 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
/**
 * A plain text line: its text and the heading it is associated with.
 */
@AllArgsConstructor
@Getter
@ToString
public class GemtextText extends AbstractGemtextLine {
    private final String line;
    private final MemexNodeHeadingId heading;

    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Text breaks a task unless the line is blank. */
    public boolean breaksTask() {
        final boolean blank = line.isBlank();
        return !blank;
    }
}

View File

@ -1,23 +0,0 @@
package nu.marginalia.memex.gemini.gmi.line;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.List;
/**
 * Gemtext line node holding a block of literal lines.
 *
 * <p>The constructor and getters are generated by Lombok; the constructor takes the
 * fields in declaration order ({@code items}, {@code heading}).
 */
@AllArgsConstructor @Getter @ToString
public class GemtextTextLiteral extends AbstractGemtextLine {
    /** The literal lines, kept verbatim. */
    private final List<String> items;
    /** Heading identifier supplied when this node was constructed. */
    private final MemexNodeHeadingId heading;

    /** Double-dispatches to the visitor overload for {@link GemtextTextLiteral}. */
    @Override
    public <T> T visit(GemtextLineVisitor<T> visitor) {
        return visitor.visit(this);
    }

    /** Always {@code false} for a literal block. */
    @Override
    public boolean breaksTask() {
        return false;
    }
}

View File

@ -1,20 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import nu.marginalia.memex.gemini.gmi.line.GemtextAside;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.regex.Pattern;
/** Recognises aside lines: a line that opens with {@code (} and closes with {@code )}. */
public class GemtextAsideParser {
    private static final Pattern asidePattern = Pattern.compile("^\\((.*)\\)$");

    /**
     * Parses an aside line.
     *
     * @param s       the raw line
     * @param heading heading identifier to attach to the resulting node
     * @return an aside holding the text between the outer parentheses,
     *         or {@code null} when the line is not fully parenthesised
     */
    public static GemtextAside parse(String s, MemexNodeHeadingId heading) {
        var m = asidePattern.matcher(s);
        return m.matches()
                ? new GemtextAside(m.group(1), heading)
                : null;
    }
}

View File

@ -1,26 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine;
import nu.marginalia.memex.gemini.gmi.line.GemtextHeading;
import nu.marginalia.memex.gemini.gmi.line.GemtextText;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.regex.Pattern;
/** Parses heading lines: one or more {@code #} characters followed by the heading text. */
public class GemtextHeadingParser {
    private static final Pattern headingPattern = Pattern.compile("^(#+)\\s*([^#].*|$)$");

    /**
     * Parses a heading line.
     *
     * @param s       the raw line
     * @param heading the heading identifier in effect before this line
     * @return a {@link GemtextHeading} whose identifier is {@code heading.next(level)},
     *         where {@code level} is the number of {@code #} characters minus one;
     *         or a {@link GemtextText} carrying the unchanged line when it is not a heading
     */
    public static AbstractGemtextLine parse(String s, MemexNodeHeadingId heading) {
        var m = headingPattern.matcher(s);
        if (m.matches()) {
            final String hashes = m.group(1);
            var headingId = heading.next(hashes.length() - 1);
            final String title = m.group(2);
            return new GemtextHeading(headingId, title, headingId);
        }
        return new GemtextText(s, heading);
    }
}

View File

@ -1,42 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine;
import nu.marginalia.memex.gemini.gmi.line.GemtextLink;
import nu.marginalia.memex.gemini.gmi.line.GemtextText;
import nu.marginalia.memex.memex.model.MemexExternalUrl;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.model.MemexUrl;
import javax.annotation.Nullable;
import java.util.regex.Pattern;
/** Parses link lines of the form {@code => <url> [title]}. */
public class GemtextLinkParser {
    private static final Pattern linkPattern = Pattern.compile("^=>\\s?([^\\s]+)\\s*(.+)?$");

    /**
     * Parses a link line.
     *
     * @param s       the raw line
     * @param heading heading identifier to attach to the resulting node
     * @return a {@link GemtextLink}, whose title is {@code null} when the line has no
     *         title part; or a {@link GemtextText} carrying the unchanged line when it
     *         is not a link. Never returns {@code null}.
     */
    public static AbstractGemtextLine parse(String s, MemexNodeHeadingId heading) {
        var matcher = linkPattern.matcher(s);
        if (!matcher.matches()) {
            return new GemtextText(s, heading);
        }

        // Matcher.groupCount() reports the number of groups in the pattern, not how many
        // participated in the match, so it cannot be used to detect a missing title.
        // group(2) is simply null when the optional title group did not match.
        return new GemtextLink(toMemexUrl(matcher.group(1)), matcher.group(2), heading);
    }

    /** URLs beginning with {@code /} are node URLs; everything else is external. */
    private static MemexUrl toMemexUrl(String url) {
        if (url.startsWith("/")) {
            return new MemexNodeUrl(url);
        }
        return new MemexExternalUrl(url);
    }
}

View File

@ -1,17 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import java.util.regex.Pattern;
/** Extracts the item text from a list line ({@code *} followed by the item). */
public class GemtextListParser {
    // The item text is optional so that a bare "*" is still recognised as a list line.
    private static final Pattern listItemPattern = Pattern.compile("^\\*(?:\\s?(.+))?$");

    /**
     * Parses a list line.
     *
     * @param s the raw line
     * @return the item text with the leading {@code *} and at most one following
     *         whitespace character removed; the empty string for a bare {@code *};
     *         or {@code null} when the line does not start with {@code *}
     */
    public static String parse(String s) {
        var matcher = listItemPattern.matcher(s);
        if (!matcher.matches()) {
            return null;
        }

        // A bare marker has no text group; report it as an empty item rather than null
        // so that list nodes never contain null entries.
        String item = matcher.group(1);
        return item == null ? "" : item;
    }
}

View File

@ -1,135 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import nu.marginalia.memex.gemini.gmi.line.*;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeTaskId;
import java.util.*;
/**
 * Line-oriented gemtext parser. Each input line is classified by its prefix and turned
 * into an {@link AbstractGemtextLine}; consecutive list, literal and quote lines are
 * folded into a single node, as is everything between two preformat markers.
 */
public class GemtextParser {

    private static final String PREFORMAT_MARKER = "```";
    private static final String LITERAL_MARKER = " ";
    private static final String LINK_MARKER = "=>";
    private static final String HEADING_MARKER = "#";
    private static final String LIST_MARKER = "*";
    private static final String QUOTE_MARKER = ">";
    private static final String ASIDE_MARKER = "(";
    private static final String TASK_MARKER = "-";
    private static final String PRAGMA_MARKER = "%%%";

    /**
     * Parses a document.
     *
     * <p>Prefixes are tested in a fixed order: preformat, pragma, link, heading, list,
     * literal, task, quote, aside; anything else is plain text. Task lines are only
     * recognised after a {@code TASKS} pragma has been seen.
     *
     * @param lines       the document, one entry per line
     * @param headingRoot heading identifier in effect at the top of the document
     * @return the parsed nodes in document order
     */
    public static AbstractGemtextLine[] parse(String[] lines, MemexNodeHeadingId headingRoot) {
        final List<AbstractGemtextLine> parsed = new ArrayList<>();

        MemexNodeHeadingId currentHeading = headingRoot;
        MemexNodeTaskId currentTask = new MemexNodeTaskId(0);
        final Set<String> seenPragmas = new HashSet<>();

        int pos = 0;
        while (pos < lines.length) {
            final String current = lines[pos];

            if (current.startsWith(PREFORMAT_MARKER)) {
                // Block readers return the index of the last line they consumed.
                pos = readPreformatted(parsed, lines, currentHeading, pos);
            }
            else if (current.startsWith(PRAGMA_MARKER)) {
                var node = GemtextPragmaParser.parse(current, currentHeading);
                if (node instanceof GemtextPragma) {
                    seenPragmas.add(((GemtextPragma) node).getLine());
                }
                parsed.add(node);
            }
            else if (current.startsWith(LINK_MARKER)) {
                parsed.add(GemtextLinkParser.parse(current, currentHeading));
            }
            else if (current.startsWith(HEADING_MARKER)) {
                var node = GemtextHeadingParser.parse(current, currentHeading);
                // Only a real heading yields a value here; otherwise the heading is kept.
                currentHeading = node.mapHeading(GemtextHeading::getHeading).orElse(currentHeading);
                parsed.add(node);
            }
            else if (current.startsWith(LIST_MARKER)) {
                pos = readList(parsed, lines, currentHeading, pos);
            }
            else if (current.startsWith(LITERAL_MARKER)) {
                pos = readLiteral(parsed, lines, currentHeading, pos);
            }
            else if (seenPragmas.contains("TASKS") && current.startsWith(TASK_MARKER)) {
                var node = GemtextTaskParser.parse(current, currentHeading, currentTask);
                currentTask = node.mapTask(GemtextTask::getId).orElse(currentTask);
                parsed.add(node);
            }
            else if (current.startsWith(QUOTE_MARKER)) {
                pos = readQuote(parsed, lines, currentHeading, pos);
            }
            else if (current.startsWith(ASIDE_MARKER)) {
                var aside = GemtextAsideParser.parse(current, currentHeading);
                parsed.add(Objects.requireNonNullElse(aside, new GemtextText(current, currentHeading)));
            }
            else {
                parsed.add(new GemtextText(current, currentHeading));
            }

            pos++;
        }

        return parsed.toArray(AbstractGemtextLine[]::new);
    }

    /**
     * Collects the lines after the opening preformat marker at {@code i} up to, but not
     * including, the next marker line (or the end of input).
     *
     * @return the index of the closing marker, or {@code lines.length} if there is none
     */
    private static int readPreformatted(List<AbstractGemtextLine> items, String[] lines, MemexNodeHeadingId heading, int i) {
        final List<String> body = new ArrayList<>();
        int cursor = i + 1;
        while (cursor < lines.length && !lines[cursor].startsWith(PREFORMAT_MARKER)) {
            body.add(lines[cursor]);
            cursor++;
        }
        items.add(new GemtextPreformat(body, heading));
        return cursor;
    }

    /** @return the index just past the run of lines starting with {@code marker} that begins at {@code from} */
    private static int endOfRun(String[] lines, int from, String marker) {
        int end = from;
        while (end < lines.length && lines[end].startsWith(marker)) {
            end++;
        }
        return end;
    }

    /** Folds the run of list lines starting at {@code i} into one node; returns the last consumed index. */
    private static int readList(List<AbstractGemtextLine> items, String[] lines, MemexNodeHeadingId heading, int i) {
        final int end = endOfRun(lines, i, LIST_MARKER);
        final List<String> entries = new ArrayList<>();
        for (int k = i; k < end; k++) {
            entries.add(GemtextListParser.parse(lines[k]));
        }
        items.add(new GemtextList(entries, heading));
        return end - 1;
    }

    /** Folds the run of literal lines starting at {@code i} into one node; returns the last consumed index. */
    private static int readLiteral(List<AbstractGemtextLine> items, String[] lines, MemexNodeHeadingId heading, int i) {
        final int end = endOfRun(lines, i, LITERAL_MARKER);
        final List<String> verbatim = new ArrayList<>(Arrays.asList(lines).subList(i, end));
        items.add(new GemtextTextLiteral(verbatim, heading));
        return end - 1;
    }

    /** Folds the run of quote lines starting at {@code i} into one node; returns the last consumed index. */
    private static int readQuote(List<AbstractGemtextLine> items, String[] lines, MemexNodeHeadingId heading, int i) {
        final int end = endOfRun(lines, i, QUOTE_MARKER);
        final List<String> quoted = new ArrayList<>();
        for (int k = i; k < end; k++) {
            quoted.add(GemtextQuoteParser.parse(lines[k]));
        }
        items.add(new GemtextQuote(quoted, heading));
        return end - 1;
    }
}

View File

@ -1,26 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine;
import nu.marginalia.memex.gemini.gmi.line.GemtextPragma;
import nu.marginalia.memex.gemini.gmi.line.GemtextText;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import java.util.regex.Pattern;
/** Parses pragma lines: {@code %%%} followed by the pragma text. */
public class GemtextPragmaParser {
    private static final Pattern pragmaPattern = Pattern.compile("^%%%\\s*(.*|$)$");

    /**
     * Parses a pragma line.
     *
     * @param s       the raw line
     * @param heading heading identifier to attach to the resulting node
     * @return a {@link GemtextPragma} holding the text after the marker and any
     *         whitespace following it, or a {@link GemtextText} carrying the
     *         unchanged line when it is not a pragma
     */
    public static AbstractGemtextLine parse(String s, MemexNodeHeadingId heading) {
        var m = pragmaPattern.matcher(s);
        if (m.matches()) {
            final String pragmaText = m.group(1);
            return new GemtextPragma(pragmaText, heading);
        }
        return new GemtextText(s, heading);
    }
}

View File

@ -1,17 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import java.util.regex.Pattern;
/** Extracts the quoted text from a quote line ({@code >} followed by the text). */
public class GemtextQuoteParser {
    // ".*" rather than ".+": a bare ">" is a valid (empty) quoted line.
    private static final Pattern quotePattern = Pattern.compile("^>(.*)$");

    /**
     * Parses a quote line.
     *
     * @param s the raw line
     * @return everything after the leading {@code >} (whitespace is preserved), which is
     *         the empty string for a bare {@code >}; or {@code null} when the line does
     *         not start with {@code >}
     */
    public static String parse(String s) {
        var matcher = quotePattern.matcher(s);
        if (!matcher.matches()) {
            return null;
        }
        return matcher.group(1);
    }
}

View File

@ -1,31 +0,0 @@
package nu.marginalia.memex.gemini.gmi.parser;
import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine;
import nu.marginalia.memex.gemini.gmi.line.GemtextTask;
import nu.marginalia.memex.gemini.gmi.line.GemtextText;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeTaskId;
import nu.marginalia.memex.memex.model.MemexTaskTags;
import java.util.regex.Pattern;
/** Parses task lines: one or more {@code -} characters followed by the task text. */
public class GemtextTaskParser {
    private static final Pattern taskPattern = Pattern.compile("^(-+)\\s*([^-].*|$)$");

    /**
     * Parses a task line.
     *
     * @param s       the raw line
     * @param heading heading identifier to attach to the resulting node
     * @param taskId  the task identifier in effect before this line
     * @return a {@link GemtextTask} whose id is {@code taskId.next(level)}, where
     *         {@code level} is the number of {@code -} characters minus one, and whose
     *         tags are built from the task text; or a {@link GemtextText} carrying the
     *         unchanged line when it is not a task
     */
    public static AbstractGemtextLine parse(String s, MemexNodeHeadingId heading,
                                            MemexNodeTaskId taskId) {
        var m = taskPattern.matcher(s);
        if (m.matches()) {
            final String dashes = m.group(1);
            final String text = m.group(2);

            var nextId = taskId.next(dashes.length() - 1);
            var tags = new MemexTaskTags(text);
            return new GemtextTask(nextId, text, heading, tags);
        }
        return new GemtextText(s, heading);
    }
}

View File

@ -1,91 +0,0 @@
package nu.marginalia.memex.gemini.gmi.renderer;
import nu.marginalia.memex.gemini.gmi.line.*;
import java.util.function.Function;
/**
 * Visitor that renders gemtext lines to strings by delegating each line type to a
 * caller-supplied conversion function.
 */
public class GemtextRenderer implements GemtextLineVisitor<String> {

    // One converter per line type; each visit(...) overload forwards to its converter.
    private final Function<GemtextHeading, String> headingConverter;
    private final Function<GemtextLink, String> linkConverter;
    private final Function<GemtextList, String> listConverter;
    private final Function<GemtextPreformat, String> preformatConverter;
    private final Function<GemtextQuote, String> quoteConverter;
    private final Function<GemtextText, String> textConverter;
    private final Function<GemtextAside, String> asideConverter;
    private final Function<GemtextTask, String> taskConverter;
    private final Function<GemtextTextLiteral, String> literalConverter;
    private final Function<GemtextPragma, String> pragmaConverter;

    /** Creates a renderer from one conversion function per gemtext line type. */
    public GemtextRenderer(Function<GemtextHeading, String> headingConverter,
                           Function<GemtextLink, String> linkConverter,
                           Function<GemtextList, String> listConverter,
                           Function<GemtextPreformat, String> preformatConverter,
                           Function<GemtextQuote, String> quoteConverter,
                           Function<GemtextText, String> textConverter,
                           Function<GemtextAside, String> asideConverter,
                           Function<GemtextTask, String> taskConverter,
                           Function<GemtextTextLiteral, String> literalConverter,
                           Function<GemtextPragma, String> pragmaConverter)
    {
        this.headingConverter = headingConverter;
        this.linkConverter = linkConverter;
        this.listConverter = listConverter;
        this.preformatConverter = preformatConverter;
        this.quoteConverter = quoteConverter;
        this.textConverter = textConverter;
        this.asideConverter = asideConverter;
        this.taskConverter = taskConverter;
        this.literalConverter = literalConverter;
        this.pragmaConverter = pragmaConverter;
    }

    /** Renders a single line by letting it dispatch back to the matching {@code visit} overload. */
    public String renderLine(AbstractGemtextLine line) {
        return line.visit(this);
    }

    @Override
    public String visit(GemtextHeading g) { return headingConverter.apply(g); }

    @Override
    public String visit(GemtextLink g) { return linkConverter.apply(g); }

    @Override
    public String visit(GemtextList g) { return listConverter.apply(g); }

    @Override
    public String visit(GemtextPreformat g) { return preformatConverter.apply(g); }

    @Override
    public String visit(GemtextQuote g) { return quoteConverter.apply(g); }

    @Override
    public String visit(GemtextText g) { return textConverter.apply(g); }

    @Override
    public String visit(GemtextTextLiteral g) { return literalConverter.apply(g); }

    @Override
    public String visit(GemtextAside g) { return asideConverter.apply(g); }

    @Override
    public String visit(GemtextTask g) { return taskConverter.apply(g); }

    @Override
    public String visit(GemtextPragma g) { return pragmaConverter.apply(g); }
}

View File

@ -1,227 +0,0 @@
package nu.marginalia.memex.gemini.gmi.renderer;
import nu.marginalia.memex.gemini.gmi.line.*;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.model.MemexUrl;
import org.apache.logging.log4j.util.Strings;
import java.util.Objects;
import java.util.stream.Collectors;
/**
 * Builds {@link GemtextRenderer} instances that render gemtext either to HTML or back
 * to gemtext source.
 *
 * <p>Which renderers are usable depends on the constructor: HTML link rendering needs
 * {@code urlBase}, and the editable HTML renderer additionally needs {@code docUrl}.
 */
public class GemtextRendererFactory {
    public final String urlBase;
    public final String docUrl;

    /** Factory supporting every renderer. */
    public GemtextRendererFactory(String urlBase, String docUrl) {
        this.urlBase = Objects.requireNonNull(urlBase, "urlBase must not be null");
        this.docUrl = Objects.requireNonNull(docUrl, "docUrl must not be null");
    }

    /** Factory without a document URL; {@link #htmlHeadingEditable} will throw. */
    public GemtextRendererFactory(String urlBase) {
        this.urlBase = Objects.requireNonNull(urlBase, "urlBase must not be null");
        this.docUrl = null;
    }

    /** Factory without URLs; {@link #htmlLink} and {@link #htmlHeadingEditable} will throw. */
    public GemtextRendererFactory() {
        this.urlBase = null;
        this.docUrl = null;
    }

    /** HTML renderer using {@link #htmlHeadingEditable} for headings. */
    public GemtextRenderer htmlRendererEditable() {
        return new GemtextRenderer(this::htmlHeadingEditable,
                this::htmlLink, this::htmlList,
                this::htmlPre, this::htmlQuote,
                this::htmlText, this::htmlAside,
                this::htmlTask, this::htmlLiteral,
                this::htmlPragma);
    }

    /** HTML renderer using {@link #htmlHeadingReadOnly} for headings. */
    public GemtextRenderer htmlRendererReadOnly() {
        return new GemtextRenderer(this::htmlHeadingReadOnly,
                this::htmlLink, this::htmlList,
                this::htmlPre, this::htmlQuote,
                this::htmlText, this::htmlAside,
                this::htmlTask, this::htmlLiteral,
                this::htmlPragma);
    }

    /** Gemtext renderer that reproduces every line type, pragmas included. */
    public GemtextRenderer gemtextRendererAsIs() {
        return new GemtextRenderer(this::rawHeading,
                this::rawLink, this::rawList,
                this::rawPre, this::rawQuote,
                this::rawText, this::rawAside,
                this::rawTask, this::rawLiteral,
                this::rawPragma);
    }

    /** Gemtext renderer that renders pragmas as the empty string. */
    public GemtextRenderer gemtextRendererPublic() {
        return new GemtextRenderer(this::rawHeading,
                this::rawLink, this::rawList,
                this::rawPre, this::rawQuote,
                this::rawText, this::rawAside,
                this::rawTask, this::rawLiteral,
                this::rawSupressPragma);
    }

    private String htmlPragma(GemtextPragma gemtextPragma) {
        return "<!-- pragma: " + sanitizeText(gemtextPragma.getLine()) + " -->\n";
    }

    /**
     * Renders a heading for the editable view. Currently produces the same markup as
     * {@link #htmlHeadingReadOnly}, but still requires the document URL to be set.
     *
     * @throws UnsupportedOperationException if the factory was built without {@code docUrl}
     */
    public String htmlHeadingEditable(GemtextHeading g) {
        if (docUrl == null) {
            throw new UnsupportedOperationException("Wrong constructor used, need urlBase and docUrl");
        }
        return htmlHeadingReadOnly(g);
    }

    /** Renders a heading as {@code h1}..{@code h3} by level; any other level becomes {@code h4}. */
    public String htmlHeadingReadOnly(GemtextHeading g) {
        if (g.getLevel().getLevel() == 1)
            return String.format("<h1 id=\"%s\">%s</h1>\n", g.getLevel(), sanitizeText(g.getName()));
        if (g.getLevel().getLevel() == 2)
            return String.format("<h2 id=\"%s\">%s</h2>\n", g.getLevel(), sanitizeText(g.getName()));
        if (g.getLevel().getLevel() == 3)
            return String.format("<h3 id=\"%s\">%s</h3>\n", g.getLevel(), sanitizeText(g.getName()));

        return String.format("<h4 id=\"%s\">%s</h4>\n", g.getLevel(), sanitizeText(g.getName()));
    }

    /**
     * Renders a link. {@code gemini://} targets are rewritten to go through an HTTPS
     * proxy. The URL is escaped both where it is used as the {@code href} value and
     * where it is shown as the link text.
     *
     * @throws UnsupportedOperationException if the factory was built without {@code urlBase}
     */
    public String htmlLink(GemtextLink g) {
        if (urlBase == null) {
            throw new UnsupportedOperationException("Wrong constructor used, need urlBase");
        }

        final String linkClass = getLinkClass(g.getUrl());
        final String linkUrl = sanitizeText(
                getLinkUrl(g.getUrl()).replaceFirst("^gemini://", "https://proxy.vulpes.one/gemini/"));
        final String linkLabel = sanitizeText(String.valueOf(g.getUrl()));

        if (g.getTitle() != null) {
            return String.format("<dl class=\"link\"><dt><a class=\"%s\" href=\"%s\">%s</a></dt><dd>%s</dd></dl>\n",
                    linkClass, linkUrl, linkLabel, sanitizeText(g.getTitle()));
        }
        else {
            return String.format("<a class=\"%s\" href=\"%s\">%s</a><br>\n",
                    linkClass, linkUrl, linkLabel);
        }
    }

    /** Prefixes node URLs and other {@code /}-rooted URLs with {@code urlBase}. */
    private String getLinkUrl(MemexUrl url) {
        if (url instanceof MemexNodeUrl || url.getUrl().startsWith("/")) {
            return urlBase + url;
        }
        return url.toString();
    }

    private String getLinkClass(MemexUrl url) {
        if (url instanceof MemexNodeUrl) {
            return "internal";
        }
        return "external";
    }

    public String htmlList(GemtextList g) {
        return g.getItems()
                .stream()
                .map(s -> "<li>" + sanitizeText(s) + "</li>")
                .collect(
                        Collectors.joining("\n", "<ul>\n", "</ul>\n"));
    }

    public String htmlPre(GemtextPreformat g) {
        return g.getItems().stream()
                .map(this::sanitizeText)
                .collect(
                        Collectors.joining("\n", "<pre>\n", "</pre>\n"));
    }

    public String htmlLiteral(GemtextTextLiteral g) {
        return g.getItems().stream()
                .map(this::sanitizeText)
                .collect(
                        Collectors.joining("\n", "<pre class=\"literal\">\n", "</pre>\n"));
    }

    public String htmlQuote(GemtextQuote g) {
        return g.getItems().stream()
                .map(this::sanitizeText)
                .collect(
                        Collectors.joining("<br>\n", "<blockquote>\n", "</blockquote>\n"));
    }

    public String htmlText(GemtextText g) {
        return sanitizeText(g.getLine()) + "<br>\n";
    }

    public String htmlAside(GemtextAside g) {
        return "<aside>" + sanitizeText(g.getLine()) + "</aside>\n";
    }

    /**
     * Escapes text for inclusion in HTML element content or a double-quoted attribute.
     *
     * @param s the text to escape; {@code null} is treated as the empty string
     * @return {@code s} with {@code &}, {@code <}, {@code >} and {@code "} replaced by
     *         their character references
     */
    public String sanitizeText(String s) {
        if (s == null) {
            return "";
        }
        // '&' must be replaced first so the references produced below are not re-escaped.
        return s.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;");
    }

    public String htmlTask(GemtextTask g) {
        return String.format("<a class=\"task-pointer\" name=\"t%s\"></a><div class=\"task %s\" id=\"%s\">%s %s</div>\n",
                g.getId(),
                g.getState().style,
                g.getId(),
                "-".repeat(g.getLevel()),
                sanitizeText(g.getTask()));
    }

    /** Renders a heading with one to three {@code #}; levels outside 1..3 are written with three. */
    public String rawHeading(GemtextHeading g) {
        if (g.getLevel().getLevel() == 1)
            return "# " + g.getName();
        if (g.getLevel().getLevel() == 2)
            return "## " + g.getName();
        return "### " + g.getName();
    }

    public String rawLink(GemtextLink g) {
        if (g.getTitle() != null && !g.getTitle().isBlank()) {
            return "=> " + g.getUrl().getUrl() + "\t" + g.getTitle();
        }
        return "=> " + g.getUrl().getUrl();
    }

    public String rawList(GemtextList g) {
        return g.getItems()
                .stream()
                .map(s -> "* " + s)
                .collect(Collectors.joining("\n"));
    }

    public String rawPre(GemtextPreformat g) {
        return g.getItems().stream()
                .collect(Collectors.joining("\n", "```\n", "\n```"));
    }

    /**
     * Renders a quote block, one {@code >} line per item. The parser keeps whatever
     * followed the {@code >} (leading space included), so only the marker is added back.
     */
    public String rawQuote(GemtextQuote g) {
        return g.getItems().stream()
                .map(s -> ">" + s)
                .collect(Collectors.joining("\n"));
    }

    public String rawText(GemtextText g) {
        return g.getLine();
    }

    public String rawLiteral(GemtextTextLiteral g) {
        return Strings.join(g.getItems(), '\n');
    }

    public String rawAside(GemtextAside g) {
        return "(" + g.getLine() + ")";
    }

    public String rawTask(GemtextTask g) {
        return "-".repeat(Math.max(0, g.getLevel())) + " " + g.getTask();
    }

    private String rawPragma(GemtextPragma gemtextPragma) {
        return "%%% " + gemtextPragma.getLine();
    }

    private String rawSupressPragma(GemtextPragma gemtextPragma) {
        return "";
    }
}

View File

@ -1,185 +0,0 @@
package nu.marginalia.memex.gemini.io;
import nu.marginalia.memex.gemini.BadBotList;
import nu.marginalia.memex.gemini.plugins.FileType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLSocket;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
import java.util.stream.Stream;
/**
 * Wraps one accepted TLS connection: reads the request line and writes response lines
 * or raw bytes back to the client.
 */
public class GeminiConnection {
    private final SSLSocket connection;

    private final Logger logger = LoggerFactory.getLogger("Server");
    private final OutputStream os;
    private final InputStream is;
    private static final BadBotList badBotList = BadBotList.INSTANCE;

    public GeminiConnection(SSLSocket connection) throws IOException {
        this.connection = connection;

        this.os = connection.getOutputStream();
        this.is = connection.getInputStream();
    }

    /** @return the remote host address as text */
    public String getAddress() {
        return connection.getInetAddress().getHostAddress();
    }

    /**
     * Reads the request line and parses it as a URI.
     *
     * @return the requested URI, or empty when the bad-bot list rejects the query
     * @throws GeminiUserException if the request line is blank, too long or incomplete
     * @throws Exception on I/O failure or when the request is not a valid URI
     */
    public Optional<URI> readUrl() throws Exception {
        var str = new GeminiInput().get();

        if (!badBotList.isQueryPermitted(connection.getInetAddress(), str)) {
            return Optional.empty();
        }

        if (!str.isBlank()) {
            return Optional.of(new URI(str));
        }

        throw new GeminiUserException("Bad URI");
    }

    public void redirect(String address) throws IOException {
        writeStatusLine(GeminiStatusCode.REDIRECT, address);
    }

    public void redirectPermanent(String address) throws IOException {
        writeStatusLine(GeminiStatusCode.REDIRECT_PERMANENT, address);
    }

    /** Writes the response header line {@code <code> <meta>} followed by CR LF. */
    public GeminiConnection writeStatusLine(int code, String meta) throws IOException {
        write(String.format("%2d %s", code, meta));
        return this;
    }

    /** Writes the bytes as-is, with no line terminator. */
    public GeminiConnection writeBytes(byte[] data) throws IOException {
        write(data);
        return this;
    }

    public GeminiConnection printf(String pattern, Object...args) throws IOException {
        write(String.format(pattern, args));
        return this;
    }

    public GeminiConnection writeLines(String... lines) throws IOException {
        for (String s : lines) {
            write(s);
        }
        return this;
    }

    /** Writes every line of the file; a write failure is logged per line and does not abort. */
    public GeminiConnection writeLinesFromFile(Path file) throws IOException {
        try (Stream<String> lines = Files.lines(file)) {
            lines.forEach(line -> {
                try {
                    write(line);
                } catch (IOException e) {
                    logger.error("IO Error", e);
                }
            });
        }
        return this;
    }

    /** Writes every line of the stream; a write failure is logged per line and does not abort. */
    public GeminiConnection acceptLines(Stream<String> lines) {
        lines.forEach(line -> {
            try {
                write(line);
            } catch (IOException e) {
                logger.error("IO exception", e);
            }
        });
        return this;
    }

    /** Writes the string as UTF-8 followed by CR LF. */
    private void write(String s) throws IOException {
        os.write(s.getBytes(StandardCharsets.UTF_8));
        os.write(new byte[] { '\r', '\n'});
    }

    private void write(byte[] bs) throws IOException {
        os.write(bs);
    }

    // This is a weird pattern but it makes the listing code very much cleaner
    /** Logs the message and always throws {@link GeminiUserException} with it. */
    public void error(String message) {
        logger.error("{}", message);
        throw new GeminiUserException(message);
    }

    /** Shuts down output and closes the socket; failures are logged, not propagated. */
    public void close() {
        try {
            try {
                connection.shutdownOutput();
            }
            finally {
                // Always release the socket, even when the shutdown itself fails.
                connection.close();
            }
        } catch (IOException e) {
            logger.error("Failed to close connection", e);
        }
    }

    public boolean isConnected() {
        return connection.isConnected();
    }

    /** Sends a success header with the file type's MIME type, then the file contents. */
    public void respondWithFile(Path serverPath, FileType fileType) throws IOException {
        if (fileType.binary) {
            writeStatusLine(GeminiStatusCode.SUCCESS, fileType.mime)
                    .writeBytes(Files.readAllBytes(serverPath));
        }
        else {
            writeStatusLine(GeminiStatusCode.SUCCESS, fileType.mime)
                    .writeLinesFromFile(serverPath);
        }
    }

    /**
     * Reads one CR LF terminated request line from the connection, one byte at a time,
     * into a fixed 1024-byte buffer (terminator included).
     */
    public class GeminiInput {
        private final byte[] buffer = new byte[1024];
        private int idx = 0;
        final String result;

        /**
         * @throws GeminiUserException if the stream ends before CR LF, or no CR LF is
         *                             found within the buffer
         */
        public GeminiInput() throws IOException {
            for (idx = 0; idx < buffer.length; idx++) {
                if (hasEndOfLine()) {
                    result = new String(buffer, 0, idx-2, StandardCharsets.UTF_8);
                    return;
                }

                readCharacter();
            }

            // The byte that filled the buffer may itself have completed the CR LF.
            if (hasEndOfLine()) {
                result = new String(buffer, 0, idx-2, StandardCharsets.UTF_8);
                return;
            }

            error("String too long");

            // unreachable
            result = "";
        }

        public String get() {
            return result;
        }

        private void readCharacter() throws IOException {
            int rb = is.read();

            if (-1 == rb) {
                error("URL incomplete (no CR LF)");
            }

            buffer[idx] = (byte) rb;
        }

        /** @return whether the last two bytes read are CR LF; true for an empty line too */
        public boolean hasEndOfLine() {
            return idx >= 2
                    && buffer[idx - 1] == (byte) '\n'
                    && buffer[idx - 2] == (byte) '\r';
        }
    }
}

View File

@ -1,49 +0,0 @@
package nu.marginalia.memex.gemini.io;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import javax.net.ssl.*;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.KeyStore;
import java.security.SecureRandom;
/** Builds the TLS server socket factory from a JKS keystore file and a password file. */
public class GeminiSSLSetUp {
    private final Path certPasswordFile;
    private final Path certFile;

    @Inject
    public GeminiSSLSetUp(
            @Named("gemini-cert-file") Path certFile,
            @Named("gemini-cert-password-file") Path certPasswordFile) {
        this.certFile = certFile;
        this.certPasswordFile = certPasswordFile;
    }

    /** @return the full contents of the password file */
    public String getCertPassword() throws IOException {
        return Files.readString(certPasswordFile);
    }

    /**
     * Loads the keystore and builds a TLSv1.3 context whose key managers and trust
     * managers are both initialised from that keystore.
     */
    private SSLContext getContext() throws Exception {
        KeyStore ks = KeyStore.getInstance("JKS", "SUN");

        // Read the password once and reuse it for the store and the key manager.
        final char[] password = getCertPassword().toCharArray();

        // KeyStore.load does not close the stream it is given, so close it here.
        try (var keyStoreStream = Files.newInputStream(certFile)) {
            ks.load(keyStoreStream, password);
        }

        KeyManagerFactory kmf = KeyManagerFactory.getInstance("SunX509");
        kmf.init(ks, password);

        KeyManager[] keyManagers = kmf.getKeyManagers();
        TrustManagerFactory tmf = TrustManagerFactory.getInstance("X509");
        tmf.init(ks);

        TrustManager[] trustManagers = tmf.getTrustManagers();
        var ctx = SSLContext.getInstance("TLSv1.3");
        ctx.init(keyManagers, trustManagers, new SecureRandom());
        return ctx;
    }

    /** @return a server socket factory backed by a freshly built SSL context */
    public SSLServerSocketFactory getServerSocketFactory() throws Exception {
        return getContext().getServerSocketFactory();
    }
}

View File

@ -1,11 +0,0 @@
package nu.marginalia.memex.gemini.io;
/** Numeric status codes written at the start of a response header line. */
public class GeminiStatusCode {
    // 1x
    public static final int INPUT = 10;

    // 2x
    public static final int SUCCESS = 20;

    // 3x
    public static final int REDIRECT = 30;
    public static final int REDIRECT_PERMANENT = 31;

    // 4x
    public static final int ERROR_TEMPORARY = 40;
    public static final int PROXY_ERROR = 43;

    // 5x
    public static final int ERROR_PERMANENT = 50;
}

View File

@ -1,8 +0,0 @@
package nu.marginalia.memex.gemini.io;
/**
 * Unchecked exception whose message is meant to be reported back to the user.
 */
public class GeminiUserException extends RuntimeException {

    /**
     * @param message the text to report to the user
     */
    public GeminiUserException(String message) {
        super(message);
    }
}

View File

@ -1,52 +0,0 @@
package nu.marginalia.memex.gemini.plugins;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.memex.gemini.GeminiService;
import nu.marginalia.memex.gemini.io.GeminiConnection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
/** Plugin that serves regular files found under the configured server root. */
public class BareStaticPagePlugin implements Plugin {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Path geminiServerRoot;

    @Inject
    public BareStaticPagePlugin(@Named("gemini-server-root") Path geminiServerRoot) {
        this.geminiServerRoot = geminiServerRoot;
    }

    /**
     * Serves the file the URL path maps to, if there is one.
     *
     * @return {@code true} if a file was served, {@code false} if the path does not map
     *         to a regular file
     * @throws GeminiUserException if the requested path falls outside the server root
     */
    @Override
    public boolean serve(URI url, GeminiConnection connection) throws IOException {
        final Path serverPath = getServerPath(url.getPath());

        if (!Files.isRegularFile(serverPath)) {
            return false;
        }

        logger.info("Serving {}", serverPath);

        connection.respondWithFile(serverPath, FileType.match(serverPath));

        return true;
    }

    /**
     * Maps a request path to a path under the server root. A directory that contains
     * the default file resolves to that file.
     */
    private Path getServerPath(String requestPath) {
        final Path serverPath = Path.of(geminiServerRoot + requestPath);

        // Reject paths outside the root before the file system is consulted, so a
        // request cannot be used to probe for files elsewhere on the host.
        verifyPath(geminiServerRoot, serverPath);

        final Path defaultFile = serverPath.resolve(GeminiService.DEFAULT_FILENAME);
        if (Files.isDirectory(serverPath) && Files.isRegularFile(defaultFile)) {
            return defaultFile;
        }

        return serverPath;
    }
}

View File

@ -1,58 +0,0 @@
package nu.marginalia.memex.gemini.plugins;
import java.nio.file.Path;
/**
 * File types the server knows how to label, keyed by file name suffix. Each type
 * carries a MIME type, a display icon, and whether the content is sent as raw bytes.
 */
public enum FileType {
    GMI("gmi", "text/gemini", FileIcons.DOCUMENT, false),
    GEM("gem", "text/gemini", FileIcons.DOCUMENT, false),
    TXT("txt", "text/plain", FileIcons.DOCUMENT, false),
    MARKDOWN("md", "text/markdown", FileIcons.DOCUMENT, false),
    JAVA("java", "text/java", FileIcons.JAVA, false),
    PROPERTIES("properties", "text/properties", FileIcons.SETTINGS, false),
    GRADLE("gradle", "text/gradle", FileIcons.SETTINGS, false),
    ZIP("zip", "application/zip", FileIcons.ZIP, true),
    PNG("png", "image/png", FileIcons.IMAGE, true),
    // image/jpeg is the registered media type for JPEG images
    JPG("jpg", "image/jpeg", FileIcons.IMAGE, true),
    JPEG("jpeg", "image/jpeg", FileIcons.IMAGE, true),
    BIN("bin", "application/binary", FileIcons.BINARY, true),
    SH("sh", "text/sh", FileIcons.SETTINGS, false),
    XML("xml", "text/xml", FileIcons.DOCUMENT, false),
    DOCKERFILE("Dockerfile", "text/dockerfile", FileIcons.SETTINGS, false)
    ;

    /**
     * Finds the type for a file name or path.
     *
     * <p>A type matches when the last path segment either equals the type's suffix
     * (e.g. {@code Dockerfile}) or ends with a dot followed by the suffix
     * (e.g. {@code notes.gmi}). Matching is case-sensitive.
     *
     * @param fileName a file name, optionally with leading directories
     * @return the first matching type in declaration order, or {@link #BIN} if none match
     */
    public static FileType match(String fileName) {
        final String baseName = lastSegment(fileName);

        for (var type : values()) {
            if (baseName.equals(type.suffix) || baseName.endsWith("." + type.suffix)) {
                return type;
            }
        }

        return BIN;
    }

    /** Same as {@link #match(String)}, applied to the path's string form. */
    public static FileType match(Path path) {
        return match(path.toString());
    }

    /** @return the part of {@code fileName} after the last {@code /} or {@code \} */
    private static String lastSegment(String fileName) {
        int separator = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\'));
        return fileName.substring(separator + 1);
    }

    FileType(String suffix, String mime, String icon, boolean binary) {
        this.suffix = suffix;
        this.mime = mime;
        this.icon = icon;
        this.binary = binary;
    }

    /** File name suffix, without a leading dot. */
    public final String suffix;
    /** MIME type sent in the response header. */
    public final String mime;
    /** Icon shown for this type. */
    public final String icon;
    /** Whether the content is sent as raw bytes rather than line by line. */
    public final boolean binary;
}

/** Icon strings used by {@link FileType}. */
class FileIcons {
    public static final String DOCUMENT = "🗒";
    public static final String JAVA = "";
    public static final String SETTINGS = "💻";
    public static final String ZIP = "🗜";
    public static final String IMAGE = "🖼";
    public static final String DIRECTORY = "🗂";
    public static final String BINARY = "📚";
}

View File

@ -1,19 +0,0 @@
package nu.marginalia.memex.gemini.plugins;
import nu.marginalia.memex.gemini.io.GeminiConnection;
import nu.marginalia.memex.gemini.io.GeminiUserException;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
/** A request handler that may or may not choose to serve a given URL. */
public interface Plugin {
    /** @return true if content served */
    boolean serve(URI url, GeminiConnection connection) throws IOException;

    /**
     * Checks that {@code p} lies within {@code root}. Both paths are normalized first,
     * so {@code ..} segments in either one are resolved before the comparison.
     *
     * @throws GeminiUserException if {@code p} is outside {@code root}
     */
    default void verifyPath(Path root, Path p) {
        if (!p.normalize().startsWith(root.normalize())) {
            throw new GeminiUserException("ಠ_ಠ That path is off limits!");
        }
    }
}

View File

@ -1,78 +0,0 @@
package nu.marginalia.memex.gemini.plugins;
import com.google.inject.Inject;
import nu.marginalia.memex.gemini.io.GeminiConnection;
import nu.marginalia.memex.gemini.io.GeminiStatusCode;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
/** Plugin that answers {@code /search} by forwarding the query to the search backend over HTTPS. */
public class SearchPlugin implements Plugin {

    private final PoolingHttpClientConnectionManager connectionManager;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public SearchPlugin() {
        connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setMaxTotal(200);
        connectionManager.setDefaultMaxPerRoute(20);

        // HttpHost takes host, port and scheme separately; a full URL is not a hostname.
        // The route is marked secure so that it matches the HTTPS requests made below.
        HttpHost host = new HttpHost("search.marginalia.nu", 443, "https");
        connectionManager.setMaxPerRoute(new HttpRoute(host, null, true), 20);
    }

    /**
     * Handles {@code /search}. Without a query the client is asked for input;
     * with one, the backend response body is relayed as {@code text/gemini}.
     *
     * @return {@code false} for any path other than {@code /search}, otherwise {@code true}
     */
    @Override
    public boolean serve(URI url, GeminiConnection connection) throws IOException {
        if (!"/search".equals(url.getPath())) {
            return false;
        }

        String query = url.getRawQuery();

        if (null == query || "".equals(query)) {
            logger.info("Requesting search terms");
            connection.writeStatusLine(GeminiStatusCode.INPUT, "Please enter a search query");
        }
        else {
            logger.info("Delegating search query '{}'", query);

            // Only build a client once we know this request is ours to serve.
            var client = HttpClients.custom()
                    .setConnectionManager(connectionManager)
                    .build();

            final HttpGet get = new HttpGet(createSearchUri(query));
            final byte[] binaryResponse;

            try (var rsp = client.execute(get)) {
                // A response without a body has no entity; relay it as empty content.
                var entity = rsp.getEntity();
                binaryResponse = (entity == null)
                        ? new byte[0]
                        : entity.getContent().readAllBytes();
            }
            catch (IOException ex) {
                logger.error("backend error", ex);

                connection.writeStatusLine(GeminiStatusCode.PROXY_ERROR, "Failed to reach backend server");
                return true;
            }

            connection
                    .writeStatusLine(GeminiStatusCode.SUCCESS, "text/gemini")
                    .writeBytes(binaryResponse);
        }
        return true;
    }

    /** Builds the backend URL; {@code query} is the raw (still encoded) query string. */
    private URI createSearchUri(String query) {
        try {
            return new URI("https://search.marginalia.nu/search?format=gmi&query="+query);
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }
}

View File

@ -1,244 +0,0 @@
package nu.marginalia.memex.memex;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.memex.gemini.GeminiService;
import nu.marginalia.memex.gemini.gmi.GemtextDatabase;
import nu.marginalia.memex.gemini.gmi.GemtextDocument;
import nu.marginalia.memex.util.dithering.FloydSteinbergDither;
import nu.marginalia.memex.util.dithering.Palettes;
import nu.marginalia.memex.memex.change.GemtextTombstoneUpdateCaclulator;
import nu.marginalia.memex.memex.model.MemexImage;
import nu.marginalia.memex.memex.model.MemexNode;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.renderer.MemexRendererers;
import nu.marginalia.memex.memex.system.MemexFileSystemMonitor;
import nu.marginalia.memex.memex.system.MemexFileWriter;
import nu.marginalia.memex.memex.system.git.MemexGitRepo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import javax.imageio.ImageIO;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
/**
 * Entry point for operations on the memex: loading the content from disk, rendering
 * nodes when they change, and creating, updating, renaming and deleting nodes.
 *
 * Constructing the instance schedules background work (see the constructor).
 */
@Singleton
public class Memex {
    private static final Logger logger = LoggerFactory.getLogger(Memex.class);

    private final MemexData data;
    private final MemexFileSystemMonitor monitor;
    private final MemexGitRepo gitRepo;
    private final MemexLoader loader;
    private final MemexFileWriter resources;
    private final GemtextTombstoneUpdateCaclulator tombstoneUpdateCaclulator;
    private final FloydSteinbergDither ditherer = new FloydSteinbergDither(Palettes.MARGINALIA_PALETTE, 640, 480);
    private final MemexRendererers renderers;

    /**
     * Wires up the collaborators and schedules the background work: the initial load runs
     * on the RxJava io scheduler, a once-per-second poll for updated URLs is scheduled only
     * when a file system monitor was supplied, and the gemini service is run on a new thread.
     *
     * @param monitor may be null, in which case no polling for file changes is scheduled
     */
    @Inject
    public Memex(MemexData data,
                 @Nullable MemexFileSystemMonitor monitor,
                 MemexGitRepo gitRepo, MemexLoader loader,
                 @Named("html") MemexFileWriter htmlFiles,
                 GemtextTombstoneUpdateCaclulator tombstoneUpdateCaclulator,
                 MemexRendererers renderers,
                 GeminiService geminiService) {
        this.data = data;
        this.monitor = monitor;
        this.gitRepo = gitRepo;
        this.loader = loader;
        this.resources = htmlFiles;
        this.tombstoneUpdateCaclulator = tombstoneUpdateCaclulator;
        this.renderers = renderers;

        Schedulers.io().scheduleDirect(this::load);
        if (monitor != null) {
            Schedulers.io().schedulePeriodicallyDirect(this::refreshUpdatedUrls, 1, 1, TimeUnit.SECONDS);
        }
        Schedulers.newThread().scheduleDirect(geminiService::run);
    }

    /**
     * Reloads and re-renders every URL the monitor reports as updated. A failure for one
     * URL is logged and does not stop the remaining URLs from being processed.
     */
    private void refreshUpdatedUrls() {
        var updatedUrls = monitor.getUpdatedUrls();
        for (var url : updatedUrls) {
            try {
                final String path = url.toString();

                if (path.endsWith(".gmi")) {
                    var updates = loader.reloadNode(url);
                    updates.forEach(renderers::render);
                    if (!updates.isEmpty()) {
                        renderers.render(url.getParentUrl());
                    }
                } else if (path.endsWith(".png")) {
                    var updates = loader.reloadImage(url);
                    renderers.render(url);
                    if (!updates.isEmpty()) {
                        renderers.render(url.getParentUrl());
                    }
                }

                // The special files are checked independently of the extension branches above
                if (tombstoneUpdateCaclulator.isTombstoneFile(url)) {
                    loader.loadTombstones().forEach(renderers::render);
                }
                if (tombstoneUpdateCaclulator.isRedirectFile(url)) {
                    loader.loadRedirects().forEach(renderers::render);
                }
            }
            catch (Exception ex) {
                logger.error("Failed to refresh URL " + url, ex);
            }
        }
    }

    /** Copies the static resources, then loads all content and renders it; an IOException is logged. */
    private void load() {
        copyStylesheet();
        try {
            loader.load();
            renderAll();
        }
        catch (IOException ex) {
            logger.error("Failed to load", ex);
        }
    }

    /** Copies the bundled stylesheet and icons from the classpath to the html file writer. */
    private void copyStylesheet() {
        copyResource("static/memex/style-new.css", "/style-new.css", "Could not load stylesheet", "Failed to copy stylesheet");
        copyResource("static/memex/ico/dir.png", "/ico/dir.png", "Could not copy file", "Failed to copy file");
        copyResource("static/memex/ico/file.png", "/ico/file.png", "Could not copy file", "Failed to copy file");
        copyResource("static/memex/ico/root.png", "/ico/root.png", "Could not copy file", "Failed to copy file");
        copyResource("static/memex/ico/pic16.png", "/ico/pic16.png", "Could not copy file", "Failed to copy file");
    }

    /**
     * Writes a single system classpath resource to the given memex URL. Any failure,
     * including a missing resource, is logged with {@code failureMessage} and not rethrown,
     * so one missing file does not prevent the others from being copied.
     */
    private void copyResource(String classpathLocation, String targetUrl, String missingMessage, String failureMessage) {
        try (var resource = Objects.requireNonNull(
                ClassLoader.getSystemResourceAsStream(classpathLocation), missingMessage)) {
            resources.write(new MemexNodeUrl(targetUrl), resource.readAllBytes());
        }
        catch (Exception ex) {
            logger.error(failureMessage, ex);
        }
    }

    /** Renders every document, directory and image, plus tombstone/redirect entries that have no document. */
    private void renderAll() {
        data.forEach((url, doc) -> {
            renderers.render(url);
        });
        data.getDirectories().forEach(renderers::render);
        data.getImages().forEach(img -> renderers.render(img.path));
        data.getTombstones().ifPresent(this::renderTombstoneFromGemtextDb);
        data.getRedirects().ifPresent(this::renderTombstoneFromGemtextDb);
    }

    /** Renders each key of the database for which no document is currently loaded. */
    private void renderTombstoneFromGemtextDb(GemtextDatabase db) {
        db.keys()
                .stream()
                .map(MemexNodeUrl::new)
                .filter(url -> getDocument(url) == null)
                .forEach(renderers::render);
    }

    /** Replaces the contents of a node, then renders the affected nodes and the node's parent. */
    public void updateNode(MemexNodeUrl node, String text) throws IOException {
        var nodes = loader.updateNode(node, text);
        nodes.forEach(renderers::render);
        renderers.render(node.getParentUrl());
    }

    /** @return the document held for the URL, as returned by the underlying {@link MemexData} */
    public GemtextDocument getDocument(MemexNodeUrl url) {
        return data.getDocument(url);
    }

    /** @return the image held for the URL, as returned by the underlying {@link MemexData} */
    public MemexImage getImage(MemexNodeUrl url) {
        return data.getImage(url);
    }

    /** Creates a node with the given contents, then renders the affected nodes and the node's parent. */
    public void createNode(MemexNodeUrl node, String text) throws IOException {
        var nodes = loader.createNode(node, text);
        nodes.forEach(renderers::render);
        renderers.render(node.getParentUrl());
    }

    /**
     * Decodes the uploaded bytes, converts the image with the ditherer, stores the result
     * as PNG and renders the image URL and its parent.
     *
     * @throws IOException if the bytes cannot be decoded as an image, or on a read/write failure
     */
    public void uploadImage(MemexNodeUrl url, byte[] bytes) throws IOException {
        var image = ImageIO.read(new ByteArrayInputStream(bytes));
        if (image == null) {
            // ImageIO.read signals "no reader can decode this" with null rather than an exception
            throw new IOException("Could not decode uploaded image " + url);
        }

        var convertedImage = ditherer.convert(image);
        var baosOut = new ByteArrayOutputStream();
        ImageIO.write(convertedImage, "png", baosOut);

        loader.uploadImage(url, baosOut.toByteArray());
        renderers.render(url);
        renderers.render(url.getParentUrl());
    }

    /** Records a tombstone with the given message, reloads the tombstones, deletes the node and renders what was affected. */
    public void delete(MemexNode node, String message) throws IOException {
        tombstoneUpdateCaclulator.addTombstone(node.getUrl(), message)
                .visit(this);
        loader.loadTombstones();
        loader.delete(node).forEach(renderers::render);
    }

    /** @return the documents the underlying {@link MemexData} reports for the path */
    public List<GemtextDocument> getDocumentsByPath(MemexNodeUrl url) {
        return data.getDocumentsByPath(url);
    }

    /** Delegates to the git repository's pull. */
    public void gitPull() {
        gitRepo.pull();
    }

    /** Records a redirect from the source to the destination, reloads the redirects, renames the node and renders what was affected. */
    public void rename(MemexNode src, MemexNodeUrl dst) throws IOException {
        tombstoneUpdateCaclulator.addRedirect(src.getUrl(), dst.toString())
                .visit(this);
        loader.loadRedirects();
        loader.rename(src, dst).forEach(renderers::render);
    }

    /** @return the raw bytes the loader provides for the URL */
    public byte[] getRaw(MemexNodeUrl url) throws IOException {
        return loader.getRaw(url);
    }
}

View File

@ -1,87 +0,0 @@
package nu.marginalia.memex.memex;
import com.google.inject.AbstractModule;
import com.google.inject.Inject;
import com.google.inject.Provider;
import com.google.inject.name.Named;
import com.google.inject.name.Names;
import lombok.SneakyThrows;
import nu.marginalia.memex.gemini.GeminiService;
import nu.marginalia.memex.gemini.GeminiServiceDummy;
import nu.marginalia.memex.gemini.GeminiServiceImpl;
import nu.marginalia.memex.memex.system.MemexFileWriter;
import nu.marginalia.memex.memex.system.git.MemexGitRepo;
import nu.marginalia.memex.memex.system.git.MemexGitRepoDummy;
import nu.marginalia.memex.memex.system.git.MemexGitRepoImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Path;
public class MemexConfigurationModule extends AbstractModule {
private static final Logger logger = LoggerFactory.getLogger(MemexConfigurationModule.class);
private static final String MEMEX_ROOT_PROPERTY = System.getProperty("memex-root", "/var/lib/wmsa/memex");
private static final String MEMEX_HTML_PROPERTY = System.getProperty("memex-html-resources", "/var/lib/wmsa/memex-html");
private static final String MEMEX_GMI_PROPERTY = System.getProperty("memex-gmi-resources", "/var/lib/wmsa/memex-gmi");
private static final boolean MEMEX_DISABLE_GIT = Boolean.getBoolean("memex-disable-git");
private static final boolean MEMEX_DISABLE_GEMINI = Boolean.getBoolean("memex-disable-gemini");
@SneakyThrows
public MemexConfigurationModule() {
Thread.sleep(100);
}
public void configure() {
bind(Path.class).annotatedWith(Names.named("memex-root")).toInstance(Path.of(MEMEX_ROOT_PROPERTY));
bind(Path.class).annotatedWith(Names.named("memex-html-resources")).toInstance(Path.of(MEMEX_HTML_PROPERTY));
bind(Path.class).annotatedWith(Names.named("memex-gmi-resources")).toInstance(Path.of(MEMEX_GMI_PROPERTY));
bind(String.class).annotatedWith(Names.named("tombestone-special-file")).toInstance("/special/tombstone.gmi");
bind(String.class).annotatedWith(Names.named("redirects-special-file")).toInstance("/special/redirect.gmi");
switchImpl(MemexGitRepo.class, MEMEX_DISABLE_GIT, MemexGitRepoDummy.class, MemexGitRepoImpl.class);
switchImpl(GeminiService.class, MEMEX_DISABLE_GEMINI, GeminiServiceDummy.class, GeminiServiceImpl.class);
bind(MemexFileWriter.class).annotatedWith(Names.named("html")).toProvider(MemexHtmlWriterProvider.class);
bind(MemexFileWriter.class).annotatedWith(Names.named("gmi")).toProvider(MemexGmiWriterProvider.class);
}
<T> void switchImpl(Class<T> impl, boolean param, Class<? extends T> ifEnabled, Class<? extends T> ifDisabled) {
final Class<? extends T> choice;
if (param) {
choice = ifEnabled;
}
else {
choice = ifDisabled;
}
bind(impl).to(choice).asEagerSingleton();
}
public static class MemexHtmlWriterProvider implements Provider<MemexFileWriter> {
private final Path path;
@Inject
public MemexHtmlWriterProvider(@Named("memex-html-resources") Path resources) {
this.path = resources;
}
@Override
public MemexFileWriter get() {
return new MemexFileWriter(path);
}
}
public static class MemexGmiWriterProvider implements Provider<MemexFileWriter> {
private final Path path;
@Inject
public MemexGmiWriterProvider(@Named("memex-gmi-resources") Path resources) {
this.path = resources;
}
@Override
public MemexFileWriter get() {
return new MemexFileWriter(path);
}
}
}

View File

@ -1,150 +0,0 @@
package nu.marginalia.memex.memex;
import com.google.inject.Singleton;
import nu.marginalia.memex.gemini.gmi.GemtextDatabase;
import nu.marginalia.memex.gemini.gmi.GemtextDocument;
import nu.marginalia.memex.memex.model.MemexImage;
import nu.marginalia.memex.memex.model.MemexLink;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.model.fs.MemexFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.function.BiConsumer;
/**
 * In-memory state of the memex: documents and images by URL, the link graph between
 * documents, the directory structure, and the optional tombstone and redirect databases.
 *
 * Access to the maps, the link graph and the tombstone/redirect references goes through
 * methods synchronized on this instance.
 * NOTE(review): {@link #getFilesystem()}, {@link #getDirectories()} and
 * {@link #isDirectory(MemexNodeUrl)} touch the file system model without taking the
 * lock; presumably MemexFileSystem guards itself -- confirm.
 */
@Singleton
public class MemexData {
    private final MemexLinks links = new MemexLinks();
    private final Map<MemexNodeUrl, GemtextDocument> documents = new HashMap<>();
    private final Map<MemexNodeUrl, MemexImage> images = new HashMap<>();
    private final MemexFileSystem fileSystem = new MemexFileSystem();
    private final Logger logger = LoggerFactory.getLogger(getClass());

    // Both are null until the corresponding database has been loaded
    private GemtextDatabase tombstones = null;
    private GemtextDatabase redirects = null;

    /** @return a snapshot copy of all images */
    public synchronized Collection<MemexImage> getImages() {
        return new ArrayList<>(images.values());
    }

    /** @return a snapshot copy of all documents */
    public synchronized Collection<GemtextDocument> getDocuments() {
        return new ArrayList<>(documents.values());
    }

    public synchronized void setTombstones(GemtextDatabase tombstones) {
        this.tombstones = tombstones;
    }

    public synchronized void setRedirects(GemtextDatabase redirects) {
        this.redirects = redirects;
    }

    /** Stores the document under the URL and registers it with the file system model. */
    public synchronized void addDocument(MemexNodeUrl url, GemtextDocument doc) {
        logger.debug("addDocument({})", url);
        documents.put(url, doc);
        fileSystem.register(doc);
    }

    /** Stores the image under the URL and registers it with the file system model. */
    public synchronized void addImage(MemexNodeUrl url, MemexImage img) {
        images.put(url, img);
        fileSystem.register(img);
    }

    /**
     * Synchronized so that a database installed by {@link #setTombstones} on another
     * thread is guaranteed to be visible here.
     *
     * @return the tombstone database, or empty if none has been set
     */
    public synchronized Optional<GemtextDatabase> getTombstones() {
        return Optional.ofNullable(tombstones);
    }

    /**
     * Synchronized so that a database installed by {@link #setRedirects} on another
     * thread is guaranteed to be visible here.
     *
     * @return the redirect database, or empty if none has been set
     */
    public synchronized Optional<GemtextDatabase> getRedirects() {
        return Optional.ofNullable(redirects);
    }

    /**
     * Recomputes the outgoing links of a document. Links are attributed to the parent URL
     * when the file is an {@code index.gmi}; the set is ordered by destination, so at most
     * one link per destination is kept.
     */
    public synchronized void updateOutlinks(MemexNodeUrl url, GemtextDocument doc) {
        var linksForNode = new TreeSet<>(Comparator.comparing(MemexLink::getDest));

        MemexNodeUrl srcUrl = "index.gmi".equals(url.getFilename()) ? url.getParentUrl() : url;

        for (var link : doc.getLinks()) {
            link.getUrl().visitNodeUrl(nodeUrl ->
                    linksForNode.add(new MemexLink(nodeUrl, srcUrl, doc.getTitle(), doc.getHeadingForElement(link), link.getHeading()))
            );
        }

        links.setOutlinks(srcUrl, linksForNode);
    }

    /** @return the URLs the given URL links to, together with the URLs linking to it */
    public synchronized Set<MemexNodeUrl> getNeighbors(MemexNodeUrl url) {
        return links.getNeighbors(url);
    }

    /** Invokes the consumer for every document; the instance lock is held for the whole iteration. */
    public synchronized void forEach(BiConsumer<MemexNodeUrl, GemtextDocument> consumer) {
        documents.forEach(consumer);
    }

    /** @return the document stored under the URL, or null if there is none */
    public synchronized GemtextDocument getDocument(MemexNodeUrl url) {
        return documents.get(url);
    }

    /** @return the image stored under the URL, or null if there is none */
    public synchronized MemexImage getImage(MemexNodeUrl url) {
        return images.get(url);
    }

    public synchronized List<MemexLink> getBacklinks(MemexNodeUrl... urls) {
        return links.getBacklinks(urls);
    }

    public synchronized List<GemtextDocument> getDocumentsByPath(MemexNodeUrl url) {
        return fileSystem.getDocuments(url);
    }

    public synchronized List<MemexImage> getImagesByPath(MemexNodeUrl url) {
        return fileSystem.getImages(url);
    }

    public synchronized List<MemexNodeUrl> getSubdirsByPath(MemexNodeUrl url) {
        return fileSystem.getSubdirs(url);
    }

    public MemexFileSystem getFilesystem() {
        return fileSystem;
    }

    public List<MemexNodeUrl> getDirectories() {
        return fileSystem.getAllDirectories();
    }

    public boolean isDirectory(MemexNodeUrl url) {
        return fileSystem.isDirectory(url);
    }

    /**
     * Removes the image.
     *
     * @return the URLs affected by the removal: the image itself and its parent
     */
    public synchronized Set<MemexNodeUrl> deleteImage(MemexNodeUrl url) {
        images.remove(url);
        fileSystem.remove(url);

        Set<MemexNodeUrl> affectedUrls = new HashSet<>();
        affectedUrls.add(url);
        affectedUrls.add(url.getParentUrl());
        return affectedUrls;
    }

    /**
     * Removes the document and its outgoing links.
     *
     * @return the URLs affected by the removal: the document, its parent, and the
     *         destinations of the links it had
     */
    public synchronized Set<MemexNodeUrl> deleteDocument(MemexNodeUrl url) {
        Set<MemexNodeUrl> affectedUrls = new HashSet<>();
        affectedUrls.add(url);
        affectedUrls.add(url.getParentUrl());

        // Collect the link destinations before the links are dropped below
        links.getOutlinks(url)
                .stream()
                .map(MemexLink::getDest)
                .forEach(affectedUrls::add);

        documents.remove(url);
        fileSystem.remove(url);
        links.remove(url);

        return affectedUrls;
    }

    /**
     * Synchronized for the same visibility reason as {@link #getTombstones()}.
     *
     * @return true if either the tombstone or the redirect database has link data for the URL
     */
    public synchronized boolean hasTombstone(MemexNodeUrl url) {
        if (tombstones != null && tombstones.getLinkData(url).isPresent())
            return true;
        if (redirects != null && redirects.getLinkData(url).isPresent())
            return true;
        return false;
    }
}

View File

@ -1,54 +0,0 @@
package nu.marginalia.memex.memex;
import nu.marginalia.memex.memex.model.MemexLink;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Link graph between memex nodes: the outgoing links per source URL, and a backlink
 * index per destination URL that is rebuilt whenever the outgoing links change.
 *
 * The class does no synchronization of its own.
 */
public class MemexLinks {
    private Map<MemexNodeUrl, List<MemexLink>> backLinks = new HashMap<>();
    private final Map<MemexNodeUrl, Set<MemexLink>> links = new HashMap<>();

    /** Rebuilds the backlink index by grouping every known outgoing link by its destination. */
    public void updateBacklinks() {
        backLinks.clear();

        final var byDestination = Collectors.groupingBy(MemexLink::getDest);
        backLinks = links.values().stream()
                .flatMap(Set::stream)
                .collect(byDestination);
    }

    /** @return a new set with the destinations of the URL's outgoing links and the sources of its backlinks */
    public Set<MemexNodeUrl> getNeighbors(MemexNodeUrl url) {
        final Set<MemexNodeUrl> result = new HashSet<>();

        for (MemexLink outgoing : links.getOrDefault(url, Collections.emptySet())) {
            result.add(outgoing.getDest());
        }
        for (MemexLink incoming : backLinks.getOrDefault(url, Collections.emptyList())) {
            result.add(incoming.getSrc());
        }

        return result;
    }

    /** Replaces the outgoing links of the URL and rebuilds the backlink index. */
    public void setOutlinks(MemexNodeUrl url, TreeSet<MemexLink> linksForNode) {
        links.put(url, linksForNode);
        updateBacklinks();
    }

    /** @return a new list with the backlinks of all the given URLs, sorted by source */
    public List<MemexLink> getBacklinks(MemexNodeUrl... urls) {
        final List<MemexLink> collected = new ArrayList<>();

        for (MemexNodeUrl url : urls) {
            final List<MemexLink> incoming = backLinks.get(url);
            if (incoming != null) {
                collected.addAll(incoming);
            }
        }

        collected.sort(Comparator.comparing(MemexLink::getSrc));
        return collected;
    }

    /** @return the outgoing links recorded for the URL, or an empty set if there are none */
    public Set<MemexLink> getOutlinks(MemexNodeUrl url) {
        return links.getOrDefault(url, Collections.emptySet());
    }

    /** Drops the outgoing links of the URL and rebuilds the backlink index. */
    public void remove(MemexNodeUrl url) {
        links.remove(url);
        updateBacklinks();
    }
}

View File

@ -1,265 +0,0 @@
package nu.marginalia.memex.memex;
import com.google.common.collect.Sets;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.memex.gemini.gmi.GemtextDatabase;
import nu.marginalia.memex.gemini.gmi.GemtextDocument;
import nu.marginalia.memex.memex.model.MemexImage;
import nu.marginalia.memex.memex.model.MemexNode;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.system.MemexFileSystemModifiedTimes;
import nu.marginalia.memex.memex.system.MemexSourceFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.CheckReturnValue;
import java.io.File;
import java.io.IOException;
import java.nio.file.*;
import java.util.*;
/**
 * Loads memex content from the source directory into {@link MemexData}, and applies
 * create / update / rename / delete operations to both the source files and the
 * in-memory data. Most operations return the set of URLs affected by the change.
 */
public class MemexLoader {
    private final MemexData data;
    private final MemexFileSystemModifiedTimes modifiedTimes;
    private final Path root;
    private final MemexSourceFileSystem sourceFileSystem;
    private final String tombstonePath;
    private final String redirectsPath;

    private static final Logger logger = LoggerFactory.getLogger(MemexLoader.class);

    @Inject
    public MemexLoader(MemexData data,
                       MemexFileSystemModifiedTimes modifiedTimes,
                       MemexSourceFileSystem sourceFileSystem,
                       @Named("memex-root") Path root,
                       @Named("tombestone-special-file") String tombstonePath,
                       @Named("redirects-special-file") String redirectsPath) {
        this.data = data;
        this.modifiedTimes = modifiedTimes;
        this.sourceFileSystem = sourceFileSystem;
        this.root = root;
        this.tombstonePath = tombstonePath;
        this.redirectsPath = redirectsPath;
    }

    /**
     * Loads the tombstones and redirects, walks the whole root directory loading every
     * file, then recalculates the directory structure.
     */
    public void load() throws IOException {
        loadTombstones();
        loadRedirects();

        try (var files = Files.walk(root)) {
            files.forEach(this::loadFile);
        }

        data.getFilesystem().recalculateDirectories();
    }

    /** Registers a directory, or loads a gemtext or image file; an IOException is logged and swallowed. */
    private void loadFile(Path p) {
        var file = p.toFile();

        try {
            // NOTE(review): this skips any path containing ".git" anywhere, not only the .git directory -- confirm intended
            if (p.toString().contains(".git")) {
                return;
            }
            if (file.isDirectory() && !file.getName().startsWith(".")) {
                data.getFilesystem().registerDir(MemexNodeUrl.ofRelativePath(root, p));
            } else if (isGemtext(file)) {
                loadNode(p);
            } else if (isImage(file)) {
                loadImage(p);
            }
        }
        catch (IOException ex) {
            logger.error("Failed to load file " + p, ex);
        }
    }

    /** Adds the image at the path to the data, unless the modified-times tracker reports no fresh update. */
    public void loadImage(Path p) throws IOException {
        if (!modifiedTimes.isFreshUpdate(p)) {
            return;
        }

        var url = MemexNodeUrl.ofRelativePath(root, p);
        data.addImage(url, new MemexImage(url, p));
        logger.info("Loading {}", p);
    }

    /**
     * Reloads the tombstone database from disk and installs it if it could be loaded.
     *
     * @return the difference between the previous and the new database, or an empty set
     *         if either of them is absent
     */
    public Set<MemexNodeUrl> loadTombstones() {
        var oldValues = data.getTombstones();
        var newValues = loadGemtextDb(Path.of(root + tombstonePath));

        newValues.ifPresent(data::setTombstones);

        return differenceIfBothPresent(oldValues, newValues);
    }

    /**
     * Reloads the redirect database from disk and installs it if it could be loaded.
     *
     * @return the difference between the previous and the new database, or an empty set
     *         if either of them is absent
     */
    public Set<MemexNodeUrl> loadRedirects() {
        // The previous state must be the redirects, not the tombstones, for the difference to be meaningful
        var oldValues = data.getRedirects();
        var newValues = loadGemtextDb(Path.of(root + redirectsPath));

        newValues.ifPresent(data::setRedirects);

        return differenceIfBothPresent(oldValues, newValues);
    }

    /** Returns {@code oldDb.difference(newDb)} when both databases are present, otherwise an empty set. */
    private static Set<MemexNodeUrl> differenceIfBothPresent(Optional<GemtextDatabase> oldDb,
                                                             Optional<GemtextDatabase> newDb) {
        if (newDb.isPresent() && oldDb.isPresent()) {
            return oldDb.get().difference(newDb.get());
        }
        return Collections.emptySet();
    }

    /** Loads a gemtext database if the file exists; a read failure is logged and yields empty. */
    private Optional<GemtextDatabase> loadGemtextDb(Path p) {
        if (Files.exists(p)) {
            try {
                return Optional.of(GemtextDatabase.of(MemexNodeUrl.ofRelativePath(root, p), p));
            } catch (IOException e) {
                logger.error("Failed to load database " + p, e);
            }
        }
        return Optional.empty();
    }

    private boolean isGemtext(File f) {
        return f.isFile() && f.getName().endsWith(".gmi");
    }

    private boolean isImage(File f) {
        return f.isFile() && f.getName().endsWith(".png");
    }

    /** Replaces the source file for the URL and reloads the node. */
    @CheckReturnValue
    public Collection<MemexNodeUrl> updateNode(MemexNodeUrl url, String contents) throws IOException {
        sourceFileSystem.replaceFile(url, contents);
        return loadNode(url);
    }

    /** Creates the source file for the URL and loads the node. */
    @CheckReturnValue
    public Collection<MemexNodeUrl> createNode(MemexNodeUrl url, String contents) throws IOException {
        sourceFileSystem.createFile(url, contents);
        return loadNode(url);
    }

    /** Creates the image file for the URL and adds the image to the data. */
    public MemexImage uploadImage(MemexNodeUrl url, byte[] bytes) throws IOException {
        sourceFileSystem.createFile(url, bytes);
        var img = new MemexImage(url, url.asAbsolutePath(root));
        data.addImage(url, img);
        return img;
    }

    /**
     * Reloads the image, or removes it from the data if its file no longer exists.
     *
     * @return the affected URLs; for a reload, the image and all of its ancestors
     */
    public Set<MemexNodeUrl> reloadImage(MemexNodeUrl url) throws IOException {
        var path = url.asAbsolutePath(root);
        if (!Files.exists(path)) {
            return data.deleteImage(url);
        }
        else {
            loadImage(path);

            Set<MemexNodeUrl> affectedUrls = new HashSet<>();
            affectedUrls.add(url);
            for (var u = url.getParentUrl(); u != null; u = u.getParentUrl()) {
                affectedUrls.add(u);
            }
            return affectedUrls;
        }
    }

    /** Reloads the node, or removes it from the data if its file no longer exists. */
    public Set<MemexNodeUrl> reloadNode(MemexNodeUrl url) throws IOException {
        var path = url.asAbsolutePath(root);
        if (!Files.exists(path)) {
            return data.deleteDocument(url);
        }
        else {
            return loadNode(path);
        }
    }

    /**
     * Loads the node at the path. When the modified-times tracker reports no fresh update
     * the document is not re-read and only the node's own URL is returned.
     */
    public Set<MemexNodeUrl> loadNode(Path path) throws IOException {
        if (!modifiedTimes.isFreshUpdate(path)) {
            return Set.of(MemexNodeUrl.ofRelativePath(root, path));
        }

        logger.info("Loading {}", path);
        return loadNode(MemexNodeUrl.ofRelativePath(root, path));
    }

    /**
     * Reads the document for the URL from disk and updates its outgoing links.
     *
     * @return the node, its neighbors before and after the update that have a document,
     *         and all ancestors of the node
     */
    public Set<MemexNodeUrl> loadNode(MemexNodeUrl url) throws IOException {
        var doc = GemtextDocument.of(url, url.asAbsolutePath(root));
        data.addDocument(url, doc);

        // Neighbors are collected both before and after the link update, so that nodes
        // that lost a link are included as well as nodes that gained one
        Set<MemexNodeUrl> urlsAffected = data.getNeighbors(url);
        data.updateOutlinks(url, doc);
        urlsAffected.addAll(data.getNeighbors(url));
        urlsAffected.add(url);

        urlsAffected.removeIf(u -> null == data.getDocument(u));

        // Ancestors are added after the filter above, so they are kept even without a document
        for (var u = url.getParentUrl(); u != null; u = u.getParentUrl()) {
            urlsAffected.add(u);
        }

        return urlsAffected;
    }

    /** Deletes the node's source file and removes the document or image from the data. */
    public Set<MemexNodeUrl> delete(MemexNode node) throws IOException {
        sourceFileSystem.delete(node.getUrl());
        return node.visit(new MemexNode.MemexNodeVisitor<>() {
            @Override
            public Set<MemexNodeUrl> onDocument(MemexNodeUrl url) {
                return data.deleteDocument(url);
            }

            @Override
            public Set<MemexNodeUrl> onImage(MemexNodeUrl url) {
                return data.deleteImage(url);
            }
        });
    }

    /**
     * Renames the source file, removes the old node from the data and loads the new one.
     *
     * @return the union of the URLs affected by the removal and by the reload
     */
    public Set<MemexNodeUrl> rename(MemexNode src, MemexNodeUrl dst) throws IOException {
        sourceFileSystem.renameFile(src.getUrl(), dst);
        return src.visit(new MemexNode.MemexNodeVisitor<Set<MemexNodeUrl>>() {
            @Override
            public Set<MemexNodeUrl> onDocument(MemexNodeUrl url) throws IOException {
                var changes = data.deleteDocument(url);
                return Sets.union(changes, reloadNode(dst));
            }

            @Override
            public Set<MemexNodeUrl> onImage(MemexNodeUrl url) throws IOException {
                var changes = data.deleteImage(url);
                return Sets.union(changes, reloadImage(dst));
            }
        });
    }

    /** @return the raw bytes of the source file for the URL */
    public byte[] getRaw(MemexNodeUrl url) throws IOException {
        return sourceFileSystem.getRaw(url);
    }
}

View File

@ -1,31 +0,0 @@
package nu.marginalia.memex.memex;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.memex.MemexServiceDescriptors;
import nu.marginalia.memex.gemini.GeminiConfigurationModule;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.id.ServiceId;
import nu.marginalia.service.module.ConfigurationModule;
import nu.marginalia.service.server.Initialization;
/** Launcher for the memex service. */
public class MemexMain extends MainClass {
    private final MemexService service;

    @Inject
    public MemexMain(MemexService service) {
        this.service = service;
    }

    /**
     * Initializes the process for the memex service id, builds the injector from the memex,
     * gemini and service configuration modules, instantiates this class through the
     * injector, and finally marks initialization as ready.
     */
    public static void main(String... args) {
        MainClass.init(ServiceId.Other_Memex, args);

        final var memexModule = new MemexConfigurationModule();
        final var geminiModule = new GeminiConfigurationModule();
        final var serviceModule = new ConfigurationModule(MemexServiceDescriptors.descriptors, ServiceId.Other_Memex);

        final Injector injector = Guice.createInjector(memexModule, geminiModule, serviceModule);

        // Requesting the main class makes the injector construct the service it depends on
        injector.getInstance(MemexMain.class);
        injector.getInstance(Initialization.class).setReady();
    }
}

View File

@ -1,292 +0,0 @@
package nu.marginalia.memex.memex;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.client.Context;
import nu.marginalia.memex.gemini.gmi.GemtextDocument;
import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory;
import nu.marginalia.memex.auth.client.AuthClient;
import nu.marginalia.memex.memex.model.render.*;
import nu.marginalia.memex.memex.change.GemtextMutation;
import nu.marginalia.memex.memex.change.update.GemtextDocumentUpdateCalculator;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.service.server.MetricsServer;
import nu.marginalia.service.server.Service;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import javax.servlet.MultipartConfigElement;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.Objects;
import static spark.Spark.*;
/**
 * HTTP front end of the memex. Serves the rendered files from the html resource directory
 * as static files, and registers the editing endpoints (create, upload, update, rename,
 * delete, raw) under {@code public/api}, plus a {@code git-pull} trigger.
 */
public class MemexService extends Service {
    private final GemtextDocumentUpdateCalculator updateCalculator;
    private final Memex memex;
    private final MemexHtmlRenderer renderer;
    private final AuthClient authClient;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public MemexService(@Named("service-host") String ip,
                        @Named("service-port") Integer port,
                        GemtextDocumentUpdateCalculator updateCalculator,
                        Memex memex,
                        MemexHtmlRenderer renderer,
                        AuthClient authClient,
                        Initialization initialization,
                        MetricsServer metricsServer,
                        @Named("memex-html-resources") Path memexHtmlDir
                        ) {
        super(ip, port, initialization, metricsServer, () -> {
            staticFiles.externalLocation(memexHtmlDir.toString());
            staticFiles.disableMimeTypeGuessing();
            // NOTE(review): both extensions are served as text/html; presumably the files at
            // these URLs are rendered HTML pages -- confirm
            staticFiles.registerMimeType("gmi", "text/html");
            staticFiles.registerMimeType("png", "text/html");
            staticFiles.expireTime(60);
            staticFiles.header("Cache-control", "public,proxy-revalidate");
        });

        this.updateCalculator = updateCalculator;
        this.memex = memex;
        this.renderer = renderer;
        this.authClient = authClient;

        Spark.get("git-pull", this::gitPull);

        Spark.path("public/api", () -> {
            before((req, rsp) -> {
                logger.info("{} {}", req.requestMethod(), req.pathInfo());
            });
            after((req, rsp) -> {
                rsp.header("Cache-control", "no-cache");
            });

            post("/create", this::create);
            get("/create", this::createForm, this::renderModel);
            post("/upload", this::upload);
            get("/upload", this::uploadForm, this::renderModel);
            post("/update", this::update);
            get("/update", this::updateForm, this::renderModel);
            post("/rename", this::rename);
            get("/rename", this::renameForm, this::renderModel);
            post("/delete", this::delete);
            get("/delete", this::deleteForm, this::renderModel);
            get("/raw", this::raw);
        });
    }

    /** Writes the raw bytes of the node to the response as an attachment, with the node type's mime type. */
    private Object raw(Request request, Response response) throws IOException {
        final MemexNodeUrl url = new MemexNodeUrl(Objects.requireNonNull(request.queryParams("url")));

        response.type(url.toNode().getType().mime);
        response.header("Content-Disposition", "attachment; filename=" + url.getFilename());
        response.raw().getOutputStream().write(memex.getRaw(url));

        return "";
    }

    /**
     * Builds the model of the rename form for a document ({@code type=gmi}) or an image
     * ({@code type=img}). Halts with 404 if the node does not exist and with 400 for any
     * other type.
     */
    private Object renameForm(Request request, Response response) {
        final String type = Objects.requireNonNull(request.queryParams("type"));
        final MemexNodeUrl url = new MemexNodeUrl(Objects.requireNonNull(request.queryParams("url")));

        authClient.redirectToLoginIfUnauthenticated("MEMEX", request, response);

        if ("gmi".equals(type)) {
            var doc = memex.getDocument(url);
            if (null == doc) {
                Spark.halt(404);
            }

            final String docHtml = doc.render(new GemtextRendererFactory("", url.toString()).htmlRendererEditable());
            return new MemexRendererRenameFormModel(docHtml,
                    null, url, "gmi");
        }
        else if ("img".equals(type)) {
            var img = memex.getImage(url);
            if (null == img) {
                Spark.halt(404);
            }
            return new MemexRendererRenameFormModel(null,
                    new MemexRendererImageModel(img, Collections.emptyList(), null),
                    url, "img");
        }

        Spark.halt(HttpStatus.SC_BAD_REQUEST);
        return null;
    }

    /** Renames a node and redirects to its new location; requires the confirm parameter to be "on". */
    private Object rename(Request request, Response response) throws IOException {
        // NOTE(review): the other mutating handlers call requireLogIn here -- confirm the difference is intended
        authClient.redirectToLoginIfUnauthenticated("MEMEX", request, response);

        var url = Objects.requireNonNull(request.queryParams("url"));
        var name = Objects.requireNonNull(request.queryParams("name"));
        // Not used further, but its presence is still required
        var type = Objects.requireNonNull(request.queryParams("type"));
        var confirm = Objects.requireNonNull(request.queryParams("confirm"));

        if (!"on".equals(confirm)) {
            logger.error("Confirm dialog not checked, was {}", confirm);
            Spark.halt(HttpStatus.SC_BAD_REQUEST, "Confirm was not checked");
        }

        memex.rename(new MemexNodeUrl(url).toNode(), new MemexNodeUrl(name));

        response.redirect("https://memex.marginalia.nu/"+name);
        return null;
    }

    /** Triggers a git pull of the memex repository. */
    private Object gitPull(Request request, Response response) {
        logger.info("Git pull by request");
        memex.gitPull();
        return "Ok";
    }

    /** Response transformer for the form endpoints: renders the model with the html renderer. */
    private String renderModel(Object model) {
        return ((MemexRendererableDirect)model).render(renderer);
    }

    /**
     * Builds the model of the delete form for a document ({@code type=gmi}) or an image
     * ({@code type=img}). Halts with 404 if the node does not exist and with 400 for any
     * other type.
     */
    private MemexRendererDeleteFormModel deleteForm(Request request, Response response) {
        final String type = Objects.requireNonNull(request.queryParams("type"));
        final MemexNodeUrl url = new MemexNodeUrl(Objects.requireNonNull(request.queryParams("url")));

        authClient.redirectToLoginIfUnauthenticated("MEMEX", request, response);

        if ("gmi".equals(type)) {
            var doc = memex.getDocument(url);
            if (null == doc) {
                Spark.halt(404);
            }

            final String docHtml = doc.render(new GemtextRendererFactory("", url.toString()).htmlRendererEditable());
            return new MemexRendererDeleteFormModel(docHtml,
                    null, url, "gmi");
        }
        else if ("img".equals(type)) {
            var img = memex.getImage(url);
            if (null == img) {
                Spark.halt(404);
            }
            return new MemexRendererDeleteFormModel(null,
                    new MemexRendererImageModel(img, Collections.emptyList(), null),
                    url, "img");
        }

        Spark.halt(HttpStatus.SC_BAD_REQUEST);
        return null;
    }

    /** Deletes a node with the given note and redirects to its URL; requires the confirm parameter to be "on". */
    private Object delete(Request request, Response response) throws IOException {
        authClient.requireLogIn(Context.fromRequest(request));

        var url = Objects.requireNonNull(request.queryParams("url"));
        var message = Objects.requireNonNull(request.queryParams("note"));
        // Not used further, but its presence is still required
        var type = Objects.requireNonNull(request.queryParams("type"));
        var confirm = Objects.requireNonNull(request.queryParams("confirm"));

        if (!"on".equals(confirm)) {
            logger.error("Confirm dialog not checked, was {}", confirm);
            Spark.halt(HttpStatus.SC_BAD_REQUEST, "Confirm was not checked");
        }

        memex.delete(new MemexNodeUrl(url).toNode(), message);

        response.redirect("https://memex.marginalia.nu/"+url);
        return null;
    }

    /**
     * Replaces the text of one section of a document: the submitted text is parsed as
     * gemtext, the mutations against the current document are calculated and applied, and
     * the client is redirected to the document.
     */
    private Object update(Request request, Response response) throws IOException {
        authClient.requireLogIn(Context.fromRequest(request));

        String extUrl = Objects.requireNonNull(request.queryParams("url"));
        String extSection = Objects.requireNonNull(request.queryParams("section"));
        String newSectionText = Objects.requireNonNull(request.queryParams("text"));

        var url = new MemexNodeUrl(extUrl);
        var section = MemexNodeHeadingId.parse(extSection);

        // Accept both \n and \r\n line endings from the form
        var lines = newSectionText.split("\r?\n");

        var sectionGemtext = new GemtextDocument(url, lines, section);
        var updates = updateCalculator.calculateUpdates(memex.getDocument(url), section, sectionGemtext);

        for (GemtextMutation mutation : updates) {
            mutation.visit(memex);
        }

        response.redirect("https://memex.marginalia.nu/"+extUrl);
        return "";
    }

    /** Creates a new node from the directory, filename and text parameters and redirects to it. */
    private Object create(Request request, Response response) throws IOException {
        authClient.requireLogIn(Context.fromRequest(request));

        String directory = Objects.requireNonNull(request.queryParams("directory"));
        String filename = Objects.requireNonNull(request.queryParams("filename"));
        String text = Objects.requireNonNull(request.queryParams("text"));

        var url = new MemexNodeUrl(Path.of(directory).resolve(filename).toString());

        memex.createNode(url, text);

        response.redirect("https://memex.marginalia.nu/"+directory + "/" + filename);
        return "";
    }

    /** Builds the model of the create form for the given directory URL. */
    private Object createForm(Request request, Response response) {
        final MemexNodeUrl url = new MemexNodeUrl(Objects.requireNonNull(request.queryParams("url")));

        authClient.redirectToLoginIfUnauthenticated("MEMEX", request, response);

        return new MemexRenderCreateFormModel(url, memex.getDocumentsByPath(url));
    }

    /** Builds the model of the upload form for the given directory URL. */
    private Object uploadForm(Request request, Response response) {
        final MemexNodeUrl url = new MemexNodeUrl(Objects.requireNonNull(request.queryParams("url")));

        authClient.redirectToLoginIfUnauthenticated("MEMEX", request, response);

        return new MemexRenderUploadFormModel(url, memex.getDocumentsByPath(url));
    }

    /**
     * Builds the model of the update form for the root section of a document. Halts with
     * 404 if the document does not exist, like the rename and delete forms.
     */
    private Object updateForm(Request request, Response response) {
        final MemexNodeUrl url = new MemexNodeUrl(Objects.requireNonNull(request.queryParams("url")));

        authClient.redirectToLoginIfUnauthenticated("MEMEX", request, response);

        var doc = memex.getDocument(url);
        if (null == doc) {
            Spark.halt(404);
        }

        return new MemexRenderUpdateFormModel(url, doc.getTitle(), "0", doc.getSectionGemtext(MemexNodeHeadingId.ROOT));
    }

    /** Stores the uploaded "file" part as an image at directory/filename and redirects to it. */
    @SneakyThrows
    private Object upload(Request request, Response response) {
        authClient.requireLogIn(Context.fromRequest(request));

        // Multipart handling must be configured on the request before the part is read
        request.attribute("org.eclipse.jetty.multipartConfig", new MultipartConfigElement("/temp", 50*1024*1024, 50*1024*1024, 25*1024*1024));

        String directory = Objects.requireNonNull(request.queryParams("directory"));
        String filename = Objects.requireNonNull(request.queryParams("filename"));

        var url = new MemexNodeUrl(Path.of(directory).resolve(filename).toString());

        try (InputStream input = request.raw().getPart("file").getInputStream()) {
            byte[] data = input.readAllBytes();
            memex.uploadImage(url, data);
        }

        response.redirect("https://memex.marginalia.nu/"+directory + "/" + filename);
        return "";
    }
}

View File

@ -1,70 +0,0 @@
package nu.marginalia.memex.memex.change;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.memex.memex.Memex;
import nu.marginalia.memex.gemini.gmi.GemtextDocument;
import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import java.io.IOException;
/**
 * Mutation that inserts additional lines into an existing gemtext document,
 * directly after the run of lines associated with a given heading.
 */
@AllArgsConstructor
@ToString
public class GemtextAppend implements GemtextMutation {
    /** URL of the document to modify. */
    public final MemexNodeUrl doc;
    /** Heading after whose lines the new content is inserted. */
    public final MemexNodeHeadingId id;
    /** Lines to insert; each one is written followed by a newline. */
    public final String[] lines;

    /**
     * Fetches the current document, computes the appended text and writes it
     * back through {@code memex.updateNode}.
     *
     * @param memex the memex to read the document from and write the result to
     * @throws IOException declared by the interface
     */
    @Override
    public void visit(Memex memex) throws IOException {
        final var current = memex.getDocument(doc);
        memex.updateNode(doc, calculateAppend(current));
    }

    /**
     * Renders {@code document} as-is with {@link #lines} spliced in.
     *
     * <p>The document's lines are walked once: everything up to the first line
     * whose heading reports {@code isChildOf(id)} is copied, then the
     * consecutive lines for which that still holds are copied, then the new
     * lines are written, then the remainder is copied. If no line matches, the
     * new lines therefore end up at the very end of the output.
     *
     * @param document the document to render and extend
     * @return the full rendered text, every line terminated by {@code '\n'}
     */
    public String calculateAppend(GemtextDocument document) {
        final StringBuilder out = new StringBuilder();
        final var renderer = new GemtextRendererFactory().gemtextRendererAsIs();
        final var existing = document.getLines();

        int pos = 0;

        // Everything that precedes the target heading
        while (pos < existing.length && !existing[pos].getHeading().isChildOf(id)) {
            out.append(existing[pos].visit(renderer)).append('\n');
            pos++;
        }

        // The lines under the target heading
        while (pos < existing.length && existing[pos].getHeading().isChildOf(id)) {
            out.append(existing[pos].visit(renderer)).append('\n');
            pos++;
        }

        // The appended content
        for (String addition : lines) {
            out.append(addition).append('\n');
        }

        // Whatever follows the target heading
        while (pos < existing.length) {
            out.append(existing[pos].visit(renderer)).append('\n');
            pos++;
        }

        return out.toString();
    }
}

View File

@ -1,19 +0,0 @@
package nu.marginalia.memex.memex.change;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.memex.memex.Memex;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import java.io.IOException;
/**
 * Mutation that creates a node at a given URL with a given text by
 * delegating to {@code memex.createNode}.
 */
@AllArgsConstructor
@ToString
public class GemtextCreate implements GemtextMutation {
    /** URL of the node to create. */
    public final MemexNodeUrl doc;
    /** Text passed to {@code createNode} for the new node. */
    public final String text;

    /**
     * Creates the node in the supplied memex.
     *
     * @param memex the memex in which the node is created
     * @throws IOException declared by the interface
     */
    @Override
    public void visit(Memex memex) throws IOException {
        memex.createNode(this.doc, this.text);
    }
}

View File

@ -1,26 +0,0 @@
package nu.marginalia.memex.memex.change;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.memex.memex.Memex;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import java.io.IOException;
/**
 * Mutation that makes sure a document exists before applying another
 * mutation to it: if the document cannot be found it is first created from
 * {@link #text}, and then {@link #mutation} is applied.
 */
@AllArgsConstructor @ToString
public class GemtextCreateOrMutate implements GemtextMutation {
    /** URL of the document that must exist before {@link #mutation} runs. */
    public final MemexNodeUrl doc;
    /** Text used to create the document when it does not exist yet. */
    public final String text;
    /** Mutation applied once the document is known to exist. */
    public final GemtextMutation mutation;

    /**
     * Creates the document if it is missing, verifies that it can now be
     * looked up, and then runs the wrapped mutation.
     *
     * @param memex the memex to operate on
     * @throws IOException           declared by the interface
     * @throws IllegalStateException if the document is still missing after
     *                               the creation attempt
     */
    @Override
    public void visit(Memex memex) throws IOException {
        if (memex.getDocument(doc) == null) {
            memex.createNode(doc, text);
        }

        // Re-check so that the wrapped mutation never runs against a missing
        // document; name the document so the failure can be diagnosed.
        if (memex.getDocument(doc) == null) {
            throw new IllegalStateException(
                    "Document " + doc + " does not exist and could not be created");
        }

        mutation.visit(memex);
    }
}

View File

@ -1,18 +0,0 @@
package nu.marginalia.memex.memex.change;
import nu.marginalia.memex.memex.Memex;
import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
import nu.marginalia.memex.memex.model.MemexNodeUrl;
import java.io.IOException;
/**
 * A change that can be applied to a {@link Memex}.
 */
public interface GemtextMutation {

    /**
     * Applies this mutation to the given memex.
     *
     * @param memex the memex to modify
     * @throws IOException if applying the change fails with an I/O error
     */
    void visit(Memex memex) throws IOException;

    /**
     * Builds a mutation that wraps a {@link GemtextAppend} for {@code heading}
     * in a {@link GemtextCreateOrMutate} using {@code template} as the creation text.
     *
     * @param url      document to create or modify
     * @param template text handed to {@code GemtextCreateOrMutate} for creation
     * @param heading  heading handed to the append mutation
     * @param lines    lines handed to the append mutation
     * @return the combined mutation
     */
    static GemtextMutation createOrAppend(MemexNodeUrl url, String template, MemexNodeHeadingId heading, String... lines) {
        final GemtextMutation append = new GemtextAppend(url, heading, lines);
        return new GemtextCreateOrMutate(url, template, append);
    }

    /**
     * Builds a mutation that wraps a {@code GemtextPrepend} for {@code heading}
     * in a {@link GemtextCreateOrMutate} using {@code template} as the creation text.
     *
     * @param url      document to create or modify
     * @param template text handed to {@code GemtextCreateOrMutate} for creation
     * @param heading  heading handed to the prepend mutation
     * @param lines    lines handed to the prepend mutation
     * @return the combined mutation
     */
    static GemtextMutation createOrPrepend(MemexNodeUrl url, String template, MemexNodeHeadingId heading, String... lines) {
        final GemtextMutation prepend = new GemtextPrepend(url, heading, lines);
        return new GemtextCreateOrMutate(url, template, prepend);
    }
}

Some files were not shown because too many files have changed in this diff Show More