From a284682deb4a258601641aab881f9d21d6d5cec2 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 5 Sep 2023 10:38:51 +0200
Subject: [PATCH 01/14] (parquet) Add parquet library

This small library, while great, will require some modifications to fit
the project's needs, so it goes into third-party directly.
---
 README.md                                      |   9 +-
 settings.gradle                                |   1 +
 third-party/parquet-floor/build.gradle         |  20 ++
 third-party/parquet-floor/readme.md            |   8 +
 .../blue/strategic/parquet/Dehydrator.java     |  14 +
 .../java/blue/strategic/parquet/Hydrator.java  |  29 ++
 .../strategic/parquet/HydratorSupplier.java    |  20 ++
 .../blue/strategic/parquet/ParquetReader.java  | 260 ++++++++++++++++++
 .../blue/strategic/parquet/ParquetWriter.java  | 166 +++++++++++
 .../blue/strategic/parquet/ValueWriter.java    |   5 +
 .../org/apache/hadoop/conf/Configurable.java   |   5 +
 .../org/apache/hadoop/conf/Configuration.java  |  19 ++
 .../apache/hadoop/fs/FSDataInputStream.java    |  51 ++++
 .../apache/hadoop/fs/FSDataOutputStream.java   |  28 ++
 .../java/org/apache/hadoop/fs/FileStatus.java  |  21 ++
 .../java/org/apache/hadoop/fs/FileSystem.java  |  51 ++++
 .../main/java/org/apache/hadoop/fs/Path.java   |  22 ++
 .../java/org/apache/hadoop/fs/PathFilter.java  |   4 +
 .../apache/hadoop/fs/PositionedReadable.java   |  84 ++++++
 .../java/org/apache/hadoop/fs/Seekable.java    |  43 +++
 .../apache/hadoop/io/compress/CodecPool.java   |  21 ++
 .../hadoop/io/compress/CompressionCodec.java   |  11 +
 .../io/compress/CompressionInputStream.java    | 123 +++++++++
 .../io/compress/CompressionOutputStream.java   |  92 +++++++
 .../apache/hadoop/io/compress/Compressor.java  | 116 ++++++++
 .../hadoop/io/compress/CompressorStream.java   | 113 ++++++++
 .../hadoop/io/compress/Decompressor.java       | 124 +++++++++
 .../io/compress/DecompressorStream.java        | 239 ++++++++++++++++
 .../java/org/apache/hadoop/mapreduce/Job.java  |   4 +
 .../apache/hadoop/mapreduce/JobContext.java    |   4 +
 .../hadoop/mapreduce/OutputCommitter.java      |   4 +
 .../apache/hadoop/mapreduce/RecordReader.java  |   4 +
 .../apache/hadoop/mapreduce/RecordWriter.java  |   4 +
 .../hadoop/mapreduce/TaskAttemptContext.java   |   4 +
 .../mapreduce/lib/input/FileInputFormat.java   |   4 +
 .../lib/output/FileOutputCommitter.java        |   6 +
 .../lib/output/FileOutputFormat.java           |   4 +
 .../apache/hadoop/util/ReflectionUtils.java    |  22 ++
 38 files changed, 1756 insertions(+), 3 deletions(-)
 create mode 100644 third-party/parquet-floor/build.gradle
 create mode 100644 third-party/parquet-floor/readme.md
 create mode 100644 third-party/parquet-floor/src/main/java/blue/strategic/parquet/Dehydrator.java
 create mode 100644 third-party/parquet-floor/src/main/java/blue/strategic/parquet/Hydrator.java
 create mode 100644 third-party/parquet-floor/src/main/java/blue/strategic/parquet/HydratorSupplier.java
 create mode 100644 third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java
 create mode 100644 third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
 create mode 100644 third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configurable.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileStatus.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileSystem.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Path.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PathFilter.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Seekable.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CodecPool.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionCodec.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Compressor.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressorStream.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Decompressor.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/DecompressorStream.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/Job.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/JobContext.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordReader.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordWriter.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/TaskAttemptContext.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputFormat.java
 create mode 100644 third-party/parquet-floor/src/main/java/org/apache/hadoop/util/ReflectionUtils.java

diff --git a/README.md b/README.md
index ff9c1aa8..58f84e55 100644
--- a/README.md
+++ b/README.md
@@ -22,9 +22,12 @@ To set up a local test environment, follow the instructions in [📄 run/readme.
 
 ## Hardware Requirements
 
-A production-like environment requires at least 128 Gb of RAM and ideally 2 Tb+ of enterprise
-grade SSD storage, as well as some additional terabytes of slower harddrives for storing crawl
-data. It can be made to run on smaller hardware by limiting size of the index.
+A production-like environment requires a lot of RAM and ideally enterprise SSDs for
+the index, as well as some additional terabytes of slower hard drives for storing crawl
+data. It can be made to run on smaller hardware by limiting the size of the index.
+
+The system will definitely run on a 32 Gb machine, possibly smaller, but at that size it may not perform
+very well as it relies on disk caching to be fast.
 
 A local developer's deployment is possible with much smaller hardware (and index size).
 
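For orientation, the library vendored by this patch exposes a small API: a Dehydrator flattens a domain object into named Parquet columns through a ValueWriter, and a Hydrator rebuilds objects from rows streamed back by ParquetReader. Below is a minimal usage sketch based on the interfaces added further down; the Doc record, column names, and file name are hypothetical, and the generic signatures (Dehydrator<T>, Hydrator<U, S>) are assumed from upstream parquet-floor rather than shown verbatim in this diff.

import blue.strategic.parquet.Dehydrator;
import blue.strategic.parquet.Hydrator;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import blue.strategic.parquet.ParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;

public class ParquetFloorExample {
    // Hypothetical domain object used only for this sketch.
    record Doc(long id, String url) {}

    // Schema naming the same columns the dehydrator writes below.
    private static final MessageType SCHEMA = MessageTypeParser.parseMessageType(
            "message Doc { required int64 id; required binary url (UTF8); }");

    public static void main(String[] args) throws IOException {
        File file = new File("docs.parquet");

        // Dehydrator: flatten a Doc into named column values.
        Dehydrator<Doc> dehydrator = (doc, valueWriter) -> {
            valueWriter.write("id", doc.id());
            valueWriter.write("url", doc.url());
        };

        try (ParquetWriter<Doc> writer = ParquetWriter.writeFile(SCHEMA, file, dehydrator)) {
            writer.write(new Doc(1, "https://www.marginalia.nu/"));
            writer.write(new Doc(2, "https://search.marginalia.nu/"));
        }

        // Hydrator: accumulate column values into a map, then seal the map into a Doc.
        Hydrator<Map<String, Object>, Doc> hydrator = new Hydrator<>() {
            public Map<String, Object> start() { return new HashMap<>(); }
            public Map<String, Object> add(Map<String, Object> target, String heading, Object value) {
                target.put(heading, value);
                return target;
            }
            public Doc finish(Map<String, Object> target) {
                return new Doc((Long) target.get("id"), (String) target.get("url"));
            }
        };

        try (Stream<Doc> docs = ParquetReader.streamContent(file, HydratorSupplier.constantly(hydrator))) {
            docs.forEach(System.out::println);
        }
    }
}

Per the ParquetWriter constructor in this patch, files written this way default to Snappy compression and the PARQUET_2_0 writer version.
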
diff --git a/settings.gradle b/settings.gradle index 3e82b3a7..18c86ddd 100644 --- a/settings.gradle +++ b/settings.gradle @@ -83,6 +83,7 @@ include 'third-party:count-min-sketch' include 'third-party:monkey-patch-opennlp' include 'third-party:monkey-patch-gson' include 'third-party:commons-codec' +include 'third-party:parquet-floor' dependencyResolutionManagement { diff --git a/third-party/parquet-floor/build.gradle b/third-party/parquet-floor/build.gradle new file mode 100644 index 00000000..d286c43d --- /dev/null +++ b/third-party/parquet-floor/build.gradle @@ -0,0 +1,20 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(20)) + } +} + +dependencies { + implementation 'org.apache.parquet:parquet-column:1.13.1' + implementation('org.apache.parquet:parquet-hadoop:1.13.1') { + exclude group: 'commons-pool', module: 'commons-pool' + } +} + +test { + useJUnitPlatform() +} diff --git a/third-party/parquet-floor/readme.md b/third-party/parquet-floor/readme.md new file mode 100644 index 00000000..b1e21c40 --- /dev/null +++ b/third-party/parquet-floor/readme.md @@ -0,0 +1,8 @@ +# Parquet Floor + +License: APL 2.0 + +Git: https://github.com/strategicblue/parquet-floor + +It's basically an adaptor for Parquet I/O without +needing to pull half of Hadoop into your project. diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/Dehydrator.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/Dehydrator.java new file mode 100644 index 00000000..9391b20e --- /dev/null +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/Dehydrator.java @@ -0,0 +1,14 @@ +package blue.strategic.parquet; + + +/** + * Dehydrates a rich java object into a Parquet row. + */ +public interface Dehydrator { + /** + * Write the specified record into the Parquet row using the supplied writer. + * @param record the rich java object + * @param valueWriter facilitates writing to the Parquet row + */ + void dehydrate(T record, ValueWriter valueWriter); +} diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/Hydrator.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/Hydrator.java new file mode 100644 index 00000000..b8410617 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/Hydrator.java @@ -0,0 +1,29 @@ +package blue.strategic.parquet; + +/** + * Creates and hydrates a rich domain object from a Parquet row. + */ +public interface Hydrator { + + /** + * Creates a new mutable instance to be hydrated. + * @return new instance to be hydrated + */ + U start(); + + /** + * Hydrates the target instance by applying the specified value from the Parquet row. + * @param target object being hydrated + * @param heading the name of the column whose value is being applied + * @param value the value to apply + * @return the new target + */ + U add(U target, String heading, Object value); + + /** + * Seals the mutable hydration target. 
+ * @param target object being hydrated + * @return the sealed object + */ + S finish(U target); +} diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/HydratorSupplier.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/HydratorSupplier.java new file mode 100644 index 00000000..fba801c1 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/HydratorSupplier.java @@ -0,0 +1,20 @@ +package blue.strategic.parquet; + +import org.apache.parquet.column.ColumnDescriptor; + +import java.util.List; + +/** + * Supplies hydrdators. + */ +public interface HydratorSupplier { + /** + * Supplies a hydrdator from the specified list of columns. Values will always be added to the hydrator + * in the same order as the columns supplied to this function. + */ + Hydrator get(List columns); + + static HydratorSupplier constantly(final Hydrator hydrator) { + return columns -> hydrator; + } +} diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java new file mode 100644 index 00000000..4ebcfe60 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java @@ -0,0 +1,260 @@ +package blue.strategic.parquet; + +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ColumnReadStore; +import org.apache.parquet.column.ColumnReader; +import org.apache.parquet.column.impl.ColumnReadStoreImpl; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.example.DummyRecordConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; +import org.apache.parquet.io.api.GroupConverter; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.Spliterator; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +public final class ParquetReader implements Spliterator, Closeable { + private final ParquetFileReader reader; + private final Hydrator hydrator; + private final List columns; + private final MessageType schema; + private final GroupConverter recordConverter; + private final String createdBy; + + private boolean finished; + private long currentRowGroupSize = -1L; + private List currentRowGroupColumnReaders; + private long currentRowIndex = -1L; + + public static Stream streamContent(File file, HydratorSupplier hydrator) throws IOException { + return streamContent(file, hydrator, null); + } + + public static Stream streamContent(File file, HydratorSupplier hydrator, Collection columns) throws IOException { + return streamContent(makeInputFile(file), hydrator, columns); + } + + public static Stream streamContent(InputFile file, HydratorSupplier hydrator) throws IOException { + return streamContent(file, hydrator, null); + } + + public static Stream 
streamContent(InputFile file, HydratorSupplier hydrator, Collection columns) throws IOException { + return stream(spliterator(file, hydrator, columns)); + } + + public static ParquetReader spliterator(File file, HydratorSupplier hydrator) throws IOException { + return spliterator(file, hydrator, null); + } + + public static ParquetReader spliterator(File file, HydratorSupplier hydrator, Collection columns) throws IOException { + return spliterator(makeInputFile(file), hydrator, columns); + } + + public static ParquetReader spliterator(InputFile file, HydratorSupplier hydrator) throws IOException { + return spliterator(file, hydrator, null); + } + + public static ParquetReader spliterator(InputFile file, HydratorSupplier hydrator, Collection columns) throws IOException { + Set columnSet = (null == columns) ? Collections.emptySet() : Set.copyOf(columns); + return new ParquetReader<>(file, columnSet, hydrator); + } + + public static Stream stream(ParquetReader reader) { + return StreamSupport + .stream(reader, false) + .onClose(() -> closeSilently(reader)); + } + + public static Stream streamContentToStrings(File file) throws IOException { + return stream(spliterator(makeInputFile(file), columns -> { + final AtomicInteger pos = new AtomicInteger(0); + return new Hydrator() { + @Override + public String[] start() { + return new String[columns.size()]; + } + + @Override + public String[] add(String[] target, String heading, Object value) { + target[pos.getAndIncrement()] = heading + "=" + value.toString(); + return target; + } + + @Override + public String[] finish(String[] target) { + return target; + } + }; + }, null)); + } + + public static ParquetMetadata readMetadata(File file) throws IOException { + return readMetadata(makeInputFile(file)); + } + + public static ParquetMetadata readMetadata(InputFile file) throws IOException { + try (ParquetFileReader reader = ParquetFileReader.open(file)) { + return reader.getFooter(); + } + } + + private ParquetReader(InputFile file, Set columnNames, HydratorSupplier hydratorSupplier) throws IOException { + this.reader = ParquetFileReader.open(file); + FileMetaData meta = reader.getFooter().getFileMetaData(); + this.schema = meta.getSchema(); + this.recordConverter = new DummyRecordConverter(this.schema).getRootConverter(); + this.createdBy = meta.getCreatedBy(); + + this.columns = schema.getColumns().stream() + .filter(c -> columnNames.isEmpty() || columnNames.contains(c.getPath()[0])) + .collect(Collectors.toList()); + + this.hydrator = hydratorSupplier.get(this.columns); + } + + private static void closeSilently(Closeable resource) { + try { + resource.close(); + } catch (Exception e) { + // ignore + } + } + + private static Object readValue(ColumnReader columnReader) { + ColumnDescriptor column = columnReader.getDescriptor(); + PrimitiveType primitiveType = column.getPrimitiveType(); + int maxDefinitionLevel = column.getMaxDefinitionLevel(); + + if (columnReader.getCurrentDefinitionLevel() == maxDefinitionLevel) { + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + case INT96: + return primitiveType.stringifier().stringify(columnReader.getBinary()); + case BOOLEAN: + return columnReader.getBoolean(); + case DOUBLE: + return columnReader.getDouble(); + case FLOAT: + return columnReader.getFloat(); + case INT32: + return columnReader.getInteger(); + case INT64: + return columnReader.getLong(); + default: + throw new IllegalArgumentException("Unsupported type: " + primitiveType); + } + } else { + return null; 
+ } + } + + @Override + public void close() throws IOException { + reader.close(); + } + + @Override + public boolean tryAdvance(Consumer action) { + try { + if (this.finished) { + return false; + } + + if (currentRowIndex == currentRowGroupSize) { + PageReadStore rowGroup = reader.readNextRowGroup(); + if (rowGroup == null) { + this.finished = true; + return false; + } + + ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup, this.recordConverter, this.schema, this.createdBy); + + this.currentRowGroupSize = rowGroup.getRowCount(); + this.currentRowGroupColumnReaders = columns.stream().map(columnReadStore::getColumnReader).collect(Collectors.toList()); + this.currentRowIndex = 0L; + } + + U record = hydrator.start(); + for (ColumnReader columnReader: this.currentRowGroupColumnReaders) { + record = hydrator.add(record, columnReader.getDescriptor().getPath()[0], readValue(columnReader)); + columnReader.consume(); + if (columnReader.getCurrentRepetitionLevel() != 0) { + throw new IllegalStateException("Unexpected repetition"); + } + } + + action.accept(hydrator.finish(record)); + this.currentRowIndex++; + + return true; + } catch (Exception e) { + throw new RuntimeException("Failed to read parquet", e); + } + } + + @Override + public Spliterator trySplit() { + return null; + } + + @Override + public long estimateSize() { + return reader.getRecordCount(); + } + + @Override + public int characteristics() { + return ORDERED | NONNULL | DISTINCT; + } + + public ParquetMetadata metaData() { + return this.reader.getFooter(); + } + + public static InputFile makeInputFile(File file) { + return new InputFile() { + @Override + public long getLength() { + return file.length(); + } + + @Override + public SeekableInputStream newStream() throws IOException { + FileInputStream fis = new FileInputStream(file); + return new DelegatingSeekableInputStream(fis) { + private long position; + + @Override + public long getPos() { + return position; + } + + @Override + public void seek(long newPos) throws IOException { + fis.getChannel().position(newPos); + position = newPos; + } + }; + } + }; + } +} diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java new file mode 100644 index 00000000..7d75b057 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java @@ -0,0 +1,166 @@ +package blue.strategic.parquet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.hadoop.api.WriteSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.DelegatingPositionOutputStream; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.Closeable; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Collections; + +public final class ParquetWriter implements Closeable { + + private final org.apache.parquet.hadoop.ParquetWriter writer; + + public static ParquetWriter writeFile(MessageType schema, File out, Dehydrator dehydrator) throws IOException { + OutputFile f = new OutputFile() { + @Override 
+ public PositionOutputStream create(long blockSizeHint) throws IOException { + return createOrOverwrite(blockSizeHint); + } + + @Override + public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException { + FileOutputStream fos = new FileOutputStream(out); + return new DelegatingPositionOutputStream(fos) { + @Override + public long getPos() throws IOException { + return fos.getChannel().position(); + } + }; + } + + @Override + public boolean supportsBlockSize() { + return false; + } + + @Override + public long defaultBlockSize() { + return 1024L; + } + }; + return writeOutputFile(schema, f, dehydrator); + } + + private static ParquetWriter writeOutputFile(MessageType schema, OutputFile file, Dehydrator dehydrator) throws IOException { + return new ParquetWriter<>(file, schema, dehydrator); + } + + private ParquetWriter(OutputFile outputFile, MessageType schema, Dehydrator dehydrator) throws IOException { + this.writer = new Builder(outputFile) + .withType(schema) + .withDehydrator(dehydrator) + .withCompressionCodec(CompressionCodecName.SNAPPY) + .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0) + .build(); + } + + public void write(T record) throws IOException { + writer.write(record); + } + + @Override + public void close() throws IOException { + this.writer.close(); + } + + private static final class Builder extends org.apache.parquet.hadoop.ParquetWriter.Builder> { + private MessageType schema; + private Dehydrator dehydrator; + + private Builder(OutputFile file) { + super(file); + } + + public ParquetWriter.Builder withType(MessageType schema) { + this.schema = schema; + return this; + } + + public ParquetWriter.Builder withDehydrator(Dehydrator dehydrator) { + this.dehydrator = dehydrator; + return this; + } + + @Override + protected ParquetWriter.Builder self() { + return this; + } + + @Override + protected WriteSupport getWriteSupport(Configuration conf) { + return new SimpleWriteSupport<>(schema, dehydrator); + } + } + + private static class SimpleWriteSupport extends WriteSupport { + private final MessageType schema; + private final Dehydrator dehydrator; + private final ValueWriter valueWriter = SimpleWriteSupport.this::writeField; + + private RecordConsumer recordConsumer; + + SimpleWriteSupport(MessageType schema, Dehydrator dehydrator) { + this.schema = schema; + this.dehydrator = dehydrator; + } + + @Override + public WriteContext init(Configuration configuration) { + return new WriteContext(schema, Collections.emptyMap()); + } + + @Override + public void prepareForWrite(RecordConsumer recordConsumer) { + this.recordConsumer = recordConsumer; + } + + @Override + public void write(T record) { + recordConsumer.startMessage(); + dehydrator.dehydrate(record, valueWriter); + recordConsumer.endMessage(); + } + + @Override + public String getName() { + return "blue.strategic.parquet.ParquetWriter"; + } + + private void writeField(String name, Object value) { + int fieldIndex = schema.getFieldIndex(name); + PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType(); + recordConsumer.startField(name, fieldIndex); + + switch (type.getPrimitiveTypeName()) { + case INT32: recordConsumer.addInteger((int)value); break; + case INT64: recordConsumer.addLong((long)value); break; + case DOUBLE: recordConsumer.addDouble((double)value); break; + case BOOLEAN: recordConsumer.addBoolean((boolean)value); break; + case FLOAT: recordConsumer.addFloat((float)value); break; + case BINARY: + if (type.getLogicalTypeAnnotation() == 
LogicalTypeAnnotation.stringType()) { + recordConsumer.addBinary(Binary.fromString((String)value)); + } else { + throw new UnsupportedOperationException("We don't support writing " + type.getLogicalTypeAnnotation()); + } + break; + default: + throw new UnsupportedOperationException("We don't support writing " + type.getPrimitiveTypeName()); + } + recordConsumer.endField(name, fieldIndex); + } + } +} diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java new file mode 100644 index 00000000..cf8cce3a --- /dev/null +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java @@ -0,0 +1,5 @@ +package blue.strategic.parquet; + +public interface ValueWriter { + void write(String name, Object value); +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configurable.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configurable.java new file mode 100644 index 00000000..f7ca25f6 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configurable.java @@ -0,0 +1,5 @@ +package org.apache.hadoop.conf; + +public interface Configurable { + void setConf(Configuration conf); +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java new file mode 100644 index 00000000..a9c3231d --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java @@ -0,0 +1,19 @@ +package org.apache.hadoop.conf; + +public class Configuration { + + public boolean getBoolean(String x, boolean y) { + return y; + } + + public void setBoolean(String x, boolean y) { + } + + public int getInt(String x, int y) { + return y; + } + + public String get(String x) { + return null; + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java new file mode 100644 index 00000000..f51a64e5 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java @@ -0,0 +1,51 @@ +package org.apache.hadoop.fs; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; + +public class FSDataInputStream extends InputStream { + private final RandomAccessFile input; + + public FSDataInputStream(org.apache.hadoop.fs.Path p) throws FileNotFoundException { + this.input = new RandomAccessFile(p.file(), "r"); + } + + @Override + public int read() throws IOException { + return input.read(); + } + + @Override + public int read(byte[] buf, int off, int len) throws IOException { + try { + input.readFully(buf, off, len); + return len; + } catch (IOException e) { + e.printStackTrace(); + return -1; + } + } + + public void seek(long pos) { + try { + input.seek(pos); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public void readFully(byte[] buf, int a, int b) { + try { + input.readFully(buf, a, b); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Override + public void close() throws IOException { + input.close(); + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java new file mode 100644 index 
00000000..fb065899 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java @@ -0,0 +1,28 @@ +package org.apache.hadoop.fs; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.OutputStream; +import java.io.RandomAccessFile; + +public class FSDataOutputStream extends OutputStream { + private final RandomAccessFile output; + + public FSDataOutputStream(org.apache.hadoop.fs.Path p) throws FileNotFoundException { + this.output = new RandomAccessFile(p.file(), "rw"); + } + + @Override + public void write(int b) throws IOException { + this.output.write(b); + } + + @Override + public void close() throws IOException { + output.close(); + } + + public long getPos() throws IOException { + return this.output.getFilePointer(); + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileStatus.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileStatus.java new file mode 100644 index 00000000..4ba53fcb --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileStatus.java @@ -0,0 +1,21 @@ +package org.apache.hadoop.fs; + +public class FileStatus { + private final org.apache.hadoop.fs.Path path; + + public FileStatus(org.apache.hadoop.fs.Path p) { + path = p; + } + + public boolean isFile() { + return true; + } + + public org.apache.hadoop.fs.Path getPath() { + return path; + } + + public long getLen() { + return path.file().length(); + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileSystem.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileSystem.java new file mode 100644 index 00000000..c725b460 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/FileSystem.java @@ -0,0 +1,51 @@ +package org.apache.hadoop.fs; + +import java.io.FileNotFoundException; +import java.net.URI; +import java.net.URISyntaxException; + +public class FileSystem { + + public FileStatus getFileStatus(org.apache.hadoop.fs.Path p) { + return new FileStatus(p); + } + + public org.apache.hadoop.fs.Path makeQualified(org.apache.hadoop.fs.Path p) { + return p; + } + + public URI getUri() { + try { + return new URI("http://localhost/"); + } catch (URISyntaxException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + + public short getDefaultReplication(org.apache.hadoop.fs.Path p) { + return 0; + } + + public long getDefaultBlockSize(org.apache.hadoop.fs.Path p) { + return 1024; + } + + public FSDataInputStream open(org.apache.hadoop.fs.Path p) { + try { + return new FSDataInputStream(p); + } catch (FileNotFoundException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + + public org.apache.hadoop.fs.FSDataOutputStream create(org.apache.hadoop.fs.Path p, boolean a, int b, short c, long d) { + try { + return new FSDataOutputStream(p); + } catch (FileNotFoundException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Path.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Path.java new file mode 100644 index 00000000..e2392459 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Path.java @@ -0,0 +1,22 @@ +package org.apache.hadoop.fs; + +import org.apache.hadoop.conf.Configuration; + +import java.io.File; + +public class Path { + + private final File file; + + public Path(String path) { + file = new File(path); + } + + public 
FileSystem getFileSystem(Configuration conf) { + return new FileSystem(); + } + + public File file() { + return file; + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PathFilter.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PathFilter.java new file mode 100644 index 00000000..90ab8b39 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PathFilter.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.fs; + +public interface PathFilter { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PositionedReadable.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PositionedReadable.java new file mode 100644 index 00000000..6ac1b55d --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/PositionedReadable.java @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import java.io.EOFException; +import java.io.IOException; + +/** + * Stream that permits positional reading. + * + * Implementations are required to implement thread-safe operations; this may + * be supported by concurrent access to the data, or by using a synchronization + * mechanism to serialize access. + * + * Not all implementations meet this requirement. Those that do not cannot + * be used as a backing store for some applications, such as Apache HBase. + * + * Independent of whether or not they are thread safe, some implementations + * may make the intermediate state of the system, specifically the position + * obtained in {@code Seekable.getPos()} visible. + */ +public interface PositionedReadable { + /** + * Read up to the specified number of bytes, from a given + * position within a file, and return the number of bytes read. This does not + * change the current offset of a file, and is thread-safe. + * + * Warning: Not all filesystems satisfy the thread-safety requirement. + * @param position position within file + * @param buffer destination buffer + * @param offset offset in the buffer + * @param length number of bytes to read + * @return actual number of bytes read; -1 means "none" + * @throws IOException IO problems. + */ + int read(long position, byte[] buffer, int offset, int length) + throws IOException; + + /** + * Read the specified number of bytes, from a given + * position within a file. This does not + * change the current offset of a file, and is thread-safe. + * + * Warning: Not all filesystems satisfy the thread-safety requirement. + * @param position position within file + * @param buffer destination buffer + * @param offset offset in the buffer + * @param length number of bytes to read + * @throws IOException IO problems. 
+ * @throws EOFException the end of the data was reached before + * the read operation completed + */ + void readFully(long position, byte[] buffer, int offset, int length) + throws IOException; + + /** + * Read number of bytes equal to the length of the buffer, from a given + * position within a file. This does not + * change the current offset of a file, and is thread-safe. + * + * Warning: Not all filesystems satisfy the thread-safety requirement. + * @param position position within file + * @param buffer destination buffer + * @throws IOException IO problems. + * @throws EOFException the end of the data was reached before + * the read operation completed + */ + void readFully(long position, byte[] buffer) throws IOException; +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Seekable.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Seekable.java new file mode 100644 index 00000000..66a8d3dd --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/fs/Seekable.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import java.io.IOException; + +/** + * Stream that permits seeking. + */ +public interface Seekable { + /** + * Seek to the given offset from the start of the file. + * The next read() will be from that location. Can't + * seek past the end of the file. + */ + void seek(long pos) throws IOException; + + /** + * Return the current offset from the start of the file + */ + long getPos() throws IOException; + + /** + * Seeks a different copy of the data. Returns true if + * found a new source, false otherwise. 
+ */ + boolean seekToNewSource(long targetPos) throws IOException; +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CodecPool.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CodecPool.java new file mode 100644 index 00000000..3e6873c2 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CodecPool.java @@ -0,0 +1,21 @@ +package org.apache.hadoop.io.compress; + +public final class CodecPool { + + private CodecPool() { /* prevent instantiation */ } + public static Decompressor getDecompressor(CompressionCodec codec) { + return codec.createDecompressor(); + } + + public static void returnDecompressor(Decompressor decompressor) { + + } + + public static Compressor getCompressor(CompressionCodec codec) { + return codec.createCompressor(); + } + + public static void returnCompressor(Compressor compressor) { + + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionCodec.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionCodec.java new file mode 100644 index 00000000..3d5263e0 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionCodec.java @@ -0,0 +1,11 @@ +package org.apache.hadoop.io.compress; + +import java.io.InputStream; +import java.io.OutputStream; + +public interface CompressionCodec { + Decompressor createDecompressor(); + Compressor createCompressor(); + CompressionInputStream createInputStream(InputStream is, Decompressor d); + CompressionOutputStream createOutputStream(OutputStream os, Compressor c); +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java new file mode 100644 index 00000000..2539b301 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.compress; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.hadoop.fs.PositionedReadable; +import org.apache.hadoop.fs.Seekable; +/** + * A compression input stream. + * + *
Implementations are assumed to be buffered. This permits clients to + * reposition the underlying input stream then call {@link #resetState()}, + * without having to also synchronize client buffers. + */ +public abstract class CompressionInputStream extends InputStream implements Seekable { + /** + * The input stream to be compressed. + */ + protected final InputStream in; + protected long maxAvailableData; + + private Decompressor trackedDecompressor; + + /** + * Create a compression input stream that reads + * the decompressed bytes from the given stream. + * + * @param in The input stream to be compressed. + * @throws IOException + */ + protected CompressionInputStream(InputStream in) throws IOException { + if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)) { + this.maxAvailableData = in.available(); + } + this.in = in; + } + + @Override + public void close() throws IOException { + try { + in.close(); + } finally { + if (trackedDecompressor != null) { + CodecPool.returnDecompressor(trackedDecompressor); + trackedDecompressor = null; + } + } + } + + /** + * Read bytes from the stream. + * Made abstract to prevent leakage to underlying stream. + */ + @Override + public abstract int read(byte[] b, int off, int len) throws IOException; + + /** + * Reset the decompressor to its initial state and discard any buffered data, + * as the underlying stream may have been repositioned. + */ + public abstract void resetState() throws IOException; + + /** + * This method returns the current position in the stream. + * + * @return Current position in stream as a long + */ + @Override + public long getPos() throws IOException { + if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)) { + //This way of getting the current position will not work for file + //size which can be fit in an int and hence can not be returned by + //available method. + return this.maxAvailableData - this.in.available(); + } else { + return ((Seekable)this.in).getPos(); + } + + } + + /** + * This method is current not supported. + * + * @throws UnsupportedOperationException + */ + + @Override + public void seek(long pos) throws UnsupportedOperationException { + throw new UnsupportedOperationException(); + } + + /** + * This method is current not supported. + * + * @throws UnsupportedOperationException + */ + @Override + public boolean seekToNewSource(long targetPos) throws UnsupportedOperationException { + throw new UnsupportedOperationException(); + } + + void setTrackedDecompressor(Decompressor decompressor) { + trackedDecompressor = decompressor; + } +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java new file mode 100644 index 00000000..b0ac3482 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.compress; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * A compression output stream. + */ +public abstract class CompressionOutputStream extends OutputStream { + /** + * The output stream to be compressed. + */ + protected final OutputStream out; + + /** + * If non-null, this is the Compressor object that we should call + * CodecPool#returnCompressor on when this stream is closed. + */ + private Compressor trackedCompressor; + + /** + * Create a compression output stream that writes + * the compressed bytes to the given stream. + * @param out + */ + protected CompressionOutputStream(OutputStream out) { + this.out = out; + } + + void setTrackedCompressor(Compressor compressor) { + trackedCompressor = compressor; + } + + @Override + public void close() throws IOException { + try { + finish(); + } finally { + try { + out.close(); + } finally { + if (trackedCompressor != null) { + CodecPool.returnCompressor(trackedCompressor); + trackedCompressor = null; + } + } + } + } + + @Override + public void flush() throws IOException { + out.flush(); + } + + /** + * Write compressed bytes to the stream. + * Made abstract to prevent leakage to underlying stream. + */ + @Override + public abstract void write(byte[] b, int off, int len) throws IOException; + + /** + * Finishes writing compressed data to the output stream + * without closing the underlying stream. + */ + public abstract void finish() throws IOException; + + /** + * Reset the compression to the initial state. + * Does not reset the underlying stream. + */ + public abstract void resetState() throws IOException; + +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Compressor.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Compressor.java new file mode 100644 index 00000000..3aa4e002 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Compressor.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.compress; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; + +/** + * Specification of a stream-based 'compressor' which can be + * plugged into a {@link CompressionOutputStream} to compress data. 
+ * This is modelled after {@link java.util.zip.Deflater} + * + */ +public interface Compressor { + /** + * Sets input data for compression. + * This should be called whenever #needsInput() returns + * true indicating that more input data is required. + * + * @param b Input data + * @param off Start offset + * @param len Length + */ + void setInput(byte[] b, int off, int len); + + /** + * Returns true if the input data buffer is empty and + * #setInput() should be called to provide more input. + * + * @return true if the input data buffer is empty and + * #setInput() should be called in order to provide more input. + */ + boolean needsInput(); + + /** + * Sets preset dictionary for compression. A preset dictionary + * is used when the history buffer can be predetermined. + * + * @param b Dictionary data bytes + * @param off Start offset + * @param len Length + */ + void setDictionary(byte[] b, int off, int len); + + /** + * Return number of uncompressed bytes input so far. + */ + long getBytesRead(); + + /** + * Return number of compressed bytes output so far. + */ + long getBytesWritten(); + + /** + * When called, indicates that compression should end + * with the current contents of the input buffer. + */ + void finish(); + + /** + * Returns true if the end of the compressed + * data output stream has been reached. + * @return true if the end of the compressed + * data output stream has been reached. + */ + boolean finished(); + + /** + * Fills specified buffer with compressed data. Returns actual number + * of bytes of compressed data. A return value of 0 indicates that + * needsInput() should be called in order to determine if more input + * data is required. + * + * @param b Buffer for the compressed data + * @param off Start offset of the data + * @param len Size of the buffer + * @return The actual number of bytes of compressed data. + */ + int compress(byte[] b, int off, int len) throws IOException; + + /** + * Resets compressor so that a new set of input data can be processed. + */ + void reset(); + + /** + * Closes the compressor and discards any unprocessed input. + */ + void end(); + + /** + * Prepare the compressor to be used in a new stream with settings defined in + * the given Configuration + * + * @param conf Configuration from which new setting are fetched + */ + void reinit(Configuration conf); +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressorStream.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressorStream.java new file mode 100644 index 00000000..f1f066bb --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/CompressorStream.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.io.compress; + +import java.io.IOException; +import java.io.OutputStream; + +public class CompressorStream extends CompressionOutputStream { + protected Compressor compressor; + protected byte[] buffer; + protected boolean closed; + + public CompressorStream(OutputStream out, Compressor compressor, int bufferSize) { + super(out); + + if (out == null || compressor == null) { + throw new NullPointerException(); + } else if (bufferSize <= 0) { + throw new IllegalArgumentException("Illegal bufferSize"); + } + + this.compressor = compressor; + buffer = new byte[bufferSize]; + } + + public CompressorStream(OutputStream out, Compressor compressor) { + this(out, compressor, 512); + } + + /** + * Allow derived classes to directly set the underlying stream. + * + * @param out Underlying output stream. + */ + protected CompressorStream(OutputStream out) { + super(out); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + // Sanity checks + if (compressor.finished()) { + throw new IOException("write beyond end of stream"); + } + if ((off | len | (off + len) | (b.length - (off + len))) < 0) { + throw new IndexOutOfBoundsException(); + } else if (len == 0) { + return; + } + + compressor.setInput(b, off, len); + while (!compressor.needsInput()) { + compress(); + } + } + + protected void compress() throws IOException { + int len = compressor.compress(buffer, 0, buffer.length); + if (len > 0) { + out.write(buffer, 0, len); + } + } + + @Override + public void finish() throws IOException { + if (!compressor.finished()) { + compressor.finish(); + while (!compressor.finished()) { + compress(); + } + } + } + + @Override + public void resetState() throws IOException { + compressor.reset(); + } + + @Override + public void close() throws IOException { + if (!closed) { + try { + super.close(); + } finally { + closed = true; + } + } + } + + private byte[] oneByte = new byte[1]; + @Override + public void write(int b) throws IOException { + oneByte[0] = (byte)(b & 0xff); + write(oneByte, 0, oneByte.length); + } + +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Decompressor.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Decompressor.java new file mode 100644 index 00000000..d5799037 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/Decompressor.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.compress; + +import java.io.IOException; + + +/** + * Specification of a stream-based 'de-compressor' which can be + * plugged into a {@link CompressionInputStream} to compress data. 
+ * This is modelled after {@link java.util.zip.Inflater} + * + */ +public interface Decompressor { + /** + * Sets input data for decompression. + * This should be called if and only if {@link #needsInput()} returns + * true indicating that more input data is required. + * (Both native and non-native versions of various Decompressors require + * that the data passed in via b[] remain unmodified until + * the caller is explicitly notified--via {@link #needsInput()}--that the + * buffer may be safely modified. With this requirement, an extra + * buffer-copy can be avoided.) + * + * @param b Input data + * @param off Start offset + * @param len Length + */ + void setInput(byte[] b, int off, int len); + + /** + * Returns true if the input data buffer is empty and + * {@link #setInput(byte[], int, int)} should be called to + * provide more input. + * + * @return true if the input data buffer is empty and + * {@link #setInput(byte[], int, int)} should be called in + * order to provide more input. + */ + boolean needsInput(); + + /** + * Sets preset dictionary for compression. A preset dictionary + * is used when the history buffer can be predetermined. + * + * @param b Dictionary data bytes + * @param off Start offset + * @param len Length + */ + void setDictionary(byte[] b, int off, int len); + + /** + * Returns true if a preset dictionary is needed for decompression. + * @return true if a preset dictionary is needed for decompression + */ + boolean needsDictionary(); + + /** + * Returns true if the end of the decompressed + * data output stream has been reached. Indicates a concatenated data stream + * when finished() returns true and {@link #getRemaining()} + * returns a positive value. finished() will be reset with the + * {@link #reset()} method. + * @return true if the end of the decompressed + * data output stream has been reached. + */ + boolean finished(); + + /** + * Fills specified buffer with uncompressed data. Returns actual number + * of bytes of uncompressed data. A return value of 0 indicates that + * {@link #needsInput()} should be called in order to determine if more + * input data is required. + * + * @param b Buffer for the compressed data + * @param off Start offset of the data + * @param len Size of the buffer + * @return The actual number of bytes of uncompressed data. + * @throws IOException + */ + int decompress(byte[] b, int off, int len) throws IOException; + + /** + * Returns the number of bytes remaining in the compressed data buffer. + * Indicates a concatenated data stream if {@link #finished()} returns + * true and getRemaining() returns a positive value. If + * {@link #finished()} returns true and getRemaining() returns + * a zero value, indicates that the end of data stream has been reached and + * is not a concatenated data stream. + * @return The number of bytes remaining in the compressed data buffer. + */ + int getRemaining(); + + /** + * Resets decompressor and input and output buffers so that a new set of + * input data can be processed. If {@link #finished()}} returns + * true and {@link #getRemaining()} returns a positive value, + * reset() is called before processing of the next data stream in the + * concatenated data stream. {@link #finished()} will be reset and will + * return false when reset() is called. + */ + void reset(); + + /** + * Closes the decompressor and discards any unprocessed input. 
+ */ + void end(); +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/DecompressorStream.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/DecompressorStream.java new file mode 100644 index 00000000..a516d4b2 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/io/compress/DecompressorStream.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.compress; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; + +public class DecompressorStream extends CompressionInputStream { + /** + * The maximum input buffer size. + */ + private static final int MAX_INPUT_BUFFER_SIZE = 512; + /** + * MAX_SKIP_BUFFER_SIZE is used to determine the maximum buffer size to + * use when skipping. See {@link java.io.InputStream}. + */ + private static final int MAX_SKIP_BUFFER_SIZE = 2048; + + private byte[] skipBytes; + private byte[] oneByte = new byte[1]; + + protected Decompressor decompressor; + protected byte[] buffer; + protected boolean eof; + protected boolean closed; + private int lastBytesSent; + + DecompressorStream(InputStream in, Decompressor decompressor, + int bufferSize, int skipBufferSize) + throws IOException { + super(in); + + if (decompressor == null) { + throw new NullPointerException(); + } else if (bufferSize <= 0) { + throw new IllegalArgumentException("Illegal bufferSize"); + } + + this.decompressor = decompressor; + buffer = new byte[bufferSize]; + skipBytes = new byte[skipBufferSize]; + } + + public DecompressorStream(InputStream in, Decompressor decompressor, + int bufferSize) + throws IOException { + this(in, decompressor, bufferSize, MAX_SKIP_BUFFER_SIZE); + } + + public DecompressorStream(InputStream in, Decompressor decompressor) + throws IOException { + this(in, decompressor, MAX_INPUT_BUFFER_SIZE); + } + + /** + * Allow derived classes to directly set the underlying stream. + * + * @param in Underlying input stream. + * @throws IOException + */ + protected DecompressorStream(InputStream in) throws IOException { + super(in); + } + + @Override + public int read() throws IOException { + checkStream(); + return (read(oneByte, 0, oneByte.length) == -1) ? 
-1 : (oneByte[0] & 0xff); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + checkStream(); + + if ((off | len | (off + len) | (b.length - (off + len))) < 0) { + throw new IndexOutOfBoundsException(); + } else if (len == 0) { + return 0; + } + + return decompress(b, off, len); + } + + protected int decompress(byte[] b, int off, int len) throws IOException { + int n; + + while ((n = decompressor.decompress(b, off, len)) == 0) { + if (decompressor.needsDictionary()) { + eof = true; + return -1; + } + + if (decompressor.finished()) { + // First see if there was any leftover buffered input from previous + // stream; if not, attempt to refill buffer. If refill -> EOF, we're + // all done; else reset, fix up input buffer, and get ready for next + // concatenated substream/"member". + int nRemaining = decompressor.getRemaining(); + if (nRemaining == 0) { + int m = getCompressedData(); + if (m == -1) { + // apparently the previous end-of-stream was also end-of-file: + // return success, as if we had never called getCompressedData() + eof = true; + return -1; + } + decompressor.reset(); + decompressor.setInput(buffer, 0, m); + lastBytesSent = m; + } else { + // looks like it's a concatenated stream: reset low-level zlib (or + // other engine) and buffers, then "resend" remaining input data + decompressor.reset(); + int leftoverOffset = lastBytesSent - nRemaining; + assert leftoverOffset >= 0; + // this recopies userBuf -> direct buffer if using native libraries: + decompressor.setInput(buffer, leftoverOffset, nRemaining); + // NOTE: this is the one place we do NOT want to save the number + // of bytes sent (nRemaining here) into lastBytesSent: since we + // are resending what we've already sent before, offset is nonzero + // in general (only way it could be zero is if it already equals + // nRemaining), which would then screw up the offset calculation + // _next_ time around. IOW, getRemaining() is in terms of the + // original, zero-offset bufferload, so lastBytesSent must be as + // well. Cheesy ASCII art: + // + // <------------ m, lastBytesSent -----------> + // +===============================================+ + // buffer: |1111111111|22222222222222222|333333333333| | + // +===============================================+ + // #1: <-- off -->|<-------- nRemaining ---------> + // #2: <----------- off ----------->|<-- nRem. --> + // #3: (final substream: nRemaining == 0; eof = true) + // + // If lastBytesSent is anything other than m, as shown, then "off" + // will be calculated incorrectly. 
+ } + } else if (decompressor.needsInput()) { + int m = getCompressedData(); + if (m == -1) { + throw new EOFException("Unexpected end of input stream"); + } + decompressor.setInput(buffer, 0, m); + lastBytesSent = m; + } + } + + return n; + } + + protected int getCompressedData() throws IOException { + checkStream(); + + // note that the _caller_ is now required to call setInput() or throw + return in.read(buffer, 0, buffer.length); + } + + protected void checkStream() throws IOException { + if (closed) { + throw new IOException("Stream closed"); + } + } + + @Override + public void resetState() throws IOException { + decompressor.reset(); + } + + @Override + public long skip(long n) throws IOException { + // Sanity checks + if (n < 0) { + throw new IllegalArgumentException("negative skip length"); + } + checkStream(); + + // Read 'n' bytes + int skipped = 0; + while (skipped < n) { + int len = Math.min((int)n - skipped, skipBytes.length); + len = read(skipBytes, 0, len); + if (len == -1) { + eof = true; + break; + } + skipped += len; + } + return skipped; + } + + @Override + public int available() throws IOException { + checkStream(); + return eof ? 0 : 1; + } + + @Override + public void close() throws IOException { + if (!closed) { + try { + super.close(); + } finally { + closed = true; + } + } + } + + @Override + public boolean markSupported() { + return false; + } + + @Override + public synchronized void mark(int readlimit) { + } + + @Override + public synchronized void reset() throws IOException { + throw new IOException("mark/reset not supported"); + } + +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/Job.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/Job.java new file mode 100644 index 00000000..f262d0dd --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/Job.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce; + +public class Job extends JobContext { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/JobContext.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/JobContext.java new file mode 100644 index 00000000..e781120e --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/JobContext.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce; + +public class JobContext { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java new file mode 100644 index 00000000..6cf0bd24 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce; + +public interface OutputCommitter { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordReader.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordReader.java new file mode 100644 index 00000000..92e3474c --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordReader.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce; + +public class RecordReader { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordWriter.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordWriter.java new file mode 100644 index 00000000..a578a73d --- /dev/null +++ 
b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/RecordWriter.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce; + +public class RecordWriter { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/TaskAttemptContext.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/TaskAttemptContext.java new file mode 100644 index 00000000..e5e80827 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/TaskAttemptContext.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce; + +public class TaskAttemptContext extends JobContext { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java new file mode 100644 index 00000000..62e748d5 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce.lib.input; + +public class FileInputFormat { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java new file mode 100644 index 00000000..aa66b4f0 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java @@ -0,0 +1,6 @@ +package org.apache.hadoop.mapreduce.lib.output; + +import org.apache.hadoop.mapreduce.OutputCommitter; + +public class FileOutputCommitter implements OutputCommitter { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputFormat.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputFormat.java new file mode 100644 index 00000000..daa34d02 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputFormat.java @@ -0,0 +1,4 @@ +package org.apache.hadoop.mapreduce.lib.output; + +public class FileOutputFormat { +} diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/util/ReflectionUtils.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/util/ReflectionUtils.java new file mode 100644 index 00000000..d4f0c9f2 --- /dev/null +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/util/ReflectionUtils.java @@ -0,0 +1,22 @@ +package org.apache.hadoop.util; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; + +public final class ReflectionUtils { + + private ReflectionUtils() { /* prevent instantiation */ } + + public static Object newInstance(Class type, Configuration x) { + try { + Object o = type.newInstance(); + if (o instanceof Configurable) { + ((Configurable) o).setConf(x); + } + return o; + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } +} From dbe974f5103db9bdd270e31c64c1f6321991e64b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Sep 2023 09:02:58 +0200 Subject: [PATCH 02/14] (parquet) Use ZSTD compression by default.
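ZSTD generally compresses better than Snappy at a comparable decompression cost, which keeps the writer's output files smaller. For context, the writer construction after this change amounts to roughly the following (a sketch of the ParquetWriter constructor touched in the hunk below; outputFile, schema and dehydrator are the fields it already has):

    // inside blue.strategic.parquet.ParquetWriter -- only the codec choice changes
    this.writer = new Builder(outputFile)
            .withType(schema)
            .withDehydrator(dehydrator)
            .withCompressionCodec(CompressionCodecName.ZSTD)   // previously SNAPPY
            .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
            .build();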
--- settings.gradle | 7 +++++-- third-party/parquet-floor/build.gradle | 5 ++++- .../main/java/blue/strategic/parquet/ParquetWriter.java | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/settings.gradle b/settings.gradle index 18c86ddd..884160c9 100644 --- a/settings.gradle +++ b/settings.gradle @@ -116,6 +116,7 @@ dependencyResolutionManagement { library('spark', 'com.sparkjava', 'spark-core').version('2.9.4') library('guice', 'com.google.inject', 'guice').version('7.0.0') library('guava', 'com.google.guava', 'guava').version('32.0.1-jre') + library('protobuf', 'com.google.protobuf', 'protobuf-java').version('3.0.0') library('rxjava', 'io.reactivex.rxjava3', 'rxjava').version('3.1.6') @@ -160,7 +161,6 @@ dependencyResolutionManagement { library('opencsv','com.opencsv','opencsv').version('5.6') library('bucket4j','com.github.vladimir-bukhtoyarov','bucket4j-core').version('7.5.0') - library('protobuf','com.google.protobuf','protobuf-java').version('3.0.0') library('gson','com.google.code.gson','gson').version('2.10.1') library('gson-type-adapter','com.github.Marcono1234','gson-record-type-adapter-factory').version('0.2.0') @@ -182,6 +182,9 @@ dependencyResolutionManagement { library('sqlite','org.xerial','sqlite-jdbc').version('3.41.2.1') + library('parquet-column', 'org.apache.parquet','parquet-column').version('1.13.1') + library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.13.1') + bundle('slf4j', ['slf4j.api', 'log4j.api', 'log4j.core', 'log4j.slf4j']) bundle('slf4j.test', ['slf4j.jdk14']) bundle('prometheus', ['prometheus', 'prometheus-servlet', 'prometheus-server', 'prometheus-hotspot']) @@ -192,7 +195,7 @@ dependencyResolutionManagement { bundle('gson', ['gson', 'gson-type-adapter']) bundle('httpcomponents', ['httpcomponents.core', 'httpcomponents.client']) - + bundle('parquet', ['parquet-column', 'parquet-hadoop']) bundle('junit', ['junit.jupiter', 'junit.jupiter.engine']) } diff --git a/third-party/parquet-floor/build.gradle b/third-party/parquet-floor/build.gradle index d286c43d..f8393044 100644 --- a/third-party/parquet-floor/build.gradle +++ b/third-party/parquet-floor/build.gradle @@ -9,9 +9,12 @@ java { } dependencies { - implementation 'org.apache.parquet:parquet-column:1.13.1' + implementation ('org.apache.parquet:parquet-column:1.13.1') { + transitive = true + } implementation('org.apache.parquet:parquet-hadoop:1.13.1') { exclude group: 'commons-pool', module: 'commons-pool' + transitive = true } } diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java index 7d75b057..68d4ba76 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java @@ -62,7 +62,7 @@ public final class ParquetWriter implements Closeable { this.writer = new Builder(outputFile) .withType(schema) .withDehydrator(dehydrator) - .withCompressionCodec(CompressionCodecName.SNAPPY) + .withCompressionCodec(CompressionCodecName.ZSTD) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0) .build(); } From a00cabe223eaff2cf2ed74593212c506bad90f1e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Sep 2023 14:06:43 +0200 Subject: [PATCH 03/14] (parquet-floor) Patch in support for writing and reading repeated values --- .../blue/strategic/parquet/ParquetReader.java | 13 ++-- 
.../blue/strategic/parquet/ParquetWriter.java | 59 ++++++++++++++----- .../blue/strategic/parquet/ValueWriter.java | 3 + 3 files changed, 52 insertions(+), 23 deletions(-) diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java index 4ebcfe60..3eee03d0 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java @@ -20,11 +20,7 @@ import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.Spliterator; +import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -195,8 +191,11 @@ public final class ParquetReader implements Spliterator, Closeable { U record = hydrator.start(); for (ColumnReader columnReader: this.currentRowGroupColumnReaders) { - record = hydrator.add(record, columnReader.getDescriptor().getPath()[0], readValue(columnReader)); - columnReader.consume(); + do { + record = hydrator.add(record, columnReader.getDescriptor().getPath()[0], readValue(columnReader)); + columnReader.consume(); + } while (columnReader.getCurrentRepetitionLevel() != 0); + if (columnReader.getCurrentRepetitionLevel() != 0) { throw new IllegalStateException("Unexpected repetition"); } diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java index 68d4ba76..7840c49e 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java @@ -18,6 +18,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.Collections; +import java.util.List; public final class ParquetWriter implements Closeable { @@ -108,7 +109,17 @@ public final class ParquetWriter implements Closeable { private static class SimpleWriteSupport extends WriteSupport { private final MessageType schema; private final Dehydrator dehydrator; - private final ValueWriter valueWriter = SimpleWriteSupport.this::writeField; + private final ValueWriter valueWriter = new ValueWriter() { + @Override + public void write(String name, Object value) { + SimpleWriteSupport.this.writeField(name, value); + } + + @Override + public void writeList(String name, List value) { + SimpleWriteSupport.this.writeList(name, value); + } + }; private RecordConsumer recordConsumer; @@ -144,23 +155,39 @@ public final class ParquetWriter implements Closeable { PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType(); recordConsumer.startField(name, fieldIndex); - switch (type.getPrimitiveTypeName()) { - case INT32: recordConsumer.addInteger((int)value); break; - case INT64: recordConsumer.addLong((long)value); break; - case DOUBLE: recordConsumer.addDouble((double)value); break; - case BOOLEAN: recordConsumer.addBoolean((boolean)value); break; - case FLOAT: recordConsumer.addFloat((float)value); break; - case BINARY: - if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) { - recordConsumer.addBinary(Binary.fromString((String)value)); - } else { - throw new 
UnsupportedOperationException("We don't support writing " + type.getLogicalTypeAnnotation()); - } - break; - default: - throw new UnsupportedOperationException("We don't support writing " + type.getPrimitiveTypeName()); + writeValue(type, value); + + recordConsumer.endField(name, fieldIndex); + } + + private void writeList(String name, List values) { + int fieldIndex = schema.getFieldIndex(name); + PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType(); + recordConsumer.startField(name, fieldIndex); + for (var value : values) { + writeValue(type, value); } recordConsumer.endField(name, fieldIndex); } + + void writeValue(PrimitiveType type, Object value) { + switch (type.getPrimitiveTypeName()) { + case INT32: recordConsumer.addInteger((int)value); break; + case INT64: recordConsumer.addLong((long)value); break; + case DOUBLE: recordConsumer.addDouble((double)value); break; + case BOOLEAN: recordConsumer.addBoolean((boolean)value); break; + case FLOAT: recordConsumer.addFloat((float)value); break; + case BINARY: + if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) { + recordConsumer.addBinary(Binary.fromString((String)value)); + } else { + throw new UnsupportedOperationException("We don't support writing logical annotation type " + type.getLogicalTypeAnnotation()); + } + break; + default: + throw new UnsupportedOperationException("We don't support writing " + type.getPrimitiveTypeName()); + } + + } } } diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java index cf8cce3a..e8cda912 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java @@ -1,5 +1,8 @@ package blue.strategic.parquet; +import java.util.List; + public interface ValueWriter { void write(String name, Object value); + void writeList(String name, List value); } From a52d78c8ee38057c87cc8284620d9693a45b5164 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Sep 2023 14:07:52 +0200 Subject: [PATCH 04/14] (work-log) New batching work log --- code/process-models/work-log/build.gradle | 33 +++ .../marginalia/worklog/BatchingWorkLog.java | 35 +++ .../worklog/BatchingWorkLogImpl.java | 221 ++++++++++++++++++ .../worklog/BatchingWorkLogImplTest.java | 63 +++++ settings.gradle | 1 + 5 files changed, 353 insertions(+) create mode 100644 code/process-models/work-log/build.gradle create mode 100644 code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java create mode 100644 code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java create mode 100644 code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java diff --git a/code/process-models/work-log/build.gradle b/code/process-models/work-log/build.gradle new file mode 100644 index 00000000..507621c8 --- /dev/null +++ b/code/process-models/work-log/build.gradle @@ -0,0 +1,33 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "8.2.2" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(20)) + } +} +dependencies { + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + 
+test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java new file mode 100644 index 00000000..9c6ba9bb --- /dev/null +++ b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java @@ -0,0 +1,35 @@ +package nu.marginalia.worklog; + +import java.io.IOException; + +/** The BatchingWorkLog is a work log for items of work performed in batches, + * where each batch needs to be finalized before the items it consists of can be + * considered done. This is needed when the data is serialized into a format such + * as Parquet, where disparate items go into the same file, and the writer needs to be + * properly closed before the file can be read. + */ +public interface BatchingWorkLog extends AutoCloseable { + + /** Returns true if logItem(id) has been run, + * and logFinishedBatch has been run after that. + */ + boolean isItemCommitted(String id); + + /** Returns true if logItem(id) has been run + * but not logFinishedBatch(). + *

+ * Unlike isItemCommitted(), this state is ephemeral and not + * retained if e.g. the process crashes and resumes. + * */ + boolean isItemInCurrentBatch(String id); + + /** Log additional item to the current batch */ + void logItem(String id) throws IOException; + + /** Mark the current batch as finished and increment + * the batch number counter + */ + void logFinishedBatch() throws IOException; + + int getBatchNumber(); +} diff --git a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java new file mode 100644 index 00000000..b1538b10 --- /dev/null +++ b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java @@ -0,0 +1,221 @@ +package nu.marginalia.worklog; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.time.LocalDateTime; +import java.util.HashSet; +import java.util.Set; + +public class BatchingWorkLogImpl implements BatchingWorkLog { + private int batchNumber = 0; + private final Set currentBatchItems = new HashSet<>(1000); + private final Set commitedItems = new HashSet<>(10_000); + private final OutputStream writer; + + public BatchingWorkLogImpl(Path file) throws IOException { + if (Files.exists(file)) { + try (var linesStream = Files.lines(file)) { + linesStream.map(WorkLogItem::parse).forEach( + item -> item.replay(this) + ); + } + + writer = Files.newOutputStream(file, StandardOpenOption.APPEND); + writeLogEntry(new CommentLine("Log resumed on " + LocalDateTime.now())); + if (getCurrentBatchSize() > 0) { + writeLogEntry(new CrashMarker()); + } + } + else { + writer = Files.newOutputStream(file, StandardOpenOption.CREATE_NEW); + writeLogEntry(new CommentLine("Log created on " + LocalDateTime.now())); + writeLogEntry(new CommentLine(" Format: ")); + writeLogEntry(new CommentLine(" " + AddItem.MARKER + " ID\tsignifies adding an item to the current batch")); + writeLogEntry(new CommentLine(" " + FinishBatch.MARKER + "\tsignifies finalizing the current batch and switching to the next")); + writeLogEntry(new CommentLine(" " + CrashMarker.MARKER + "\tdiscard contents from the current batch and start over, written after a crash")); + writeLogEntry(new CommentLine("Upon a crash, items that have re-process until their batch is finalized")); + } + + + } + + void writeLogEntry(WorkLogItem item) throws IOException { + item.write(this); + } + + void writeLine(String line) throws IOException { + writer.write(line.getBytes(StandardCharsets.UTF_8)); + writer.write('\n'); + writer.flush(); + } + + @Override + public boolean isItemCommitted(String id) { + return commitedItems.contains(id); + } + + @Override + public boolean isItemInCurrentBatch(String id) { + return currentBatchItems.contains(id); + } + @Override + public void logItem(String id) throws IOException { + writeLogEntry(new AddItem(id)); + } + + @Override + public void logFinishedBatch() throws IOException { + writeLogEntry(new FinishBatch()); + incrementBatch(); + } + + void incrementBatch() { + batchNumber++; + + // Transfer all items from the current batch to the committed items' batch + commitedItems.addAll(currentBatchItems); + currentBatchItems.clear(); + } + + void restartBatch() { + currentBatchItems.clear(); + } + + void addItemToCurrentBatch(String id) { + currentBatchItems.add(id); + } + + @Override + public void close() throws IOException 
{ + writer.flush(); + writer.close(); + } + + @Override + public int getBatchNumber() { + return batchNumber; + } + + public int getCurrentBatchSize() { + return currentBatchItems.size(); + } +} + +interface WorkLogItem { + + void replay(BatchingWorkLogImpl bwl); + void write(BatchingWorkLogImpl bwl) throws IOException; + + static WorkLogItem parse(String line) { + if (line.isBlank()) + return new BlankLine(); + + var lineParts = LogLineParts.parse(line); + + return switch (lineParts.tag()) { + case CommentLine.MARKER -> new CommentLine(lineParts.arg()); + case AddItem.MARKER -> new AddItem(lineParts.arg()); + case FinishBatch.MARKER -> new FinishBatch(); + case CrashMarker.MARKER -> new CrashMarker(); + default -> throw new WorkLogParseException(line); + }; + } +} + +record LogLineParts(char tag, String arg) { + public static LogLineParts parse(String line) { + line = line.trim(); + + char tag = line.charAt(0); + String arg = line.substring(1).trim(); + + int commentIdx = arg.indexOf('#'); + if (commentIdx >= 0) arg = arg.substring(0, commentIdx).trim(); + + return new LogLineParts(tag, arg); + } +} + +record CommentLine(String comment) implements WorkLogItem { + final static char MARKER = '#'; + + @Override + public void replay(BatchingWorkLogImpl bwl) {} + + @Override + public void write(BatchingWorkLogImpl bwl) throws IOException { + bwl.writeLine(MARKER + " " + comment); + } +} +record BlankLine() implements WorkLogItem { + final static char MARKER = ' '; + + @Override + public void replay(BatchingWorkLogImpl bwl) {} + + @Override + public void write(BatchingWorkLogImpl bwl) throws IOException { + bwl.writeLine(MARKER + ""); + } +} + +record FinishBatch() implements WorkLogItem { + final static char MARKER = 'F'; + + @Override + public void replay(BatchingWorkLogImpl bwl) { + bwl.incrementBatch(); + } + + @Override + public void write(BatchingWorkLogImpl bwl) throws IOException { + bwl.writeLine("# " + LocalDateTime.now()); + bwl.writeLine("# finalizing batchNumber = " + bwl.getBatchNumber()); + bwl.writeLine(Character.toString(MARKER)); + } + + +} + +record CrashMarker() implements WorkLogItem { + final static char MARKER = 'X'; + + @Override + public void replay(BatchingWorkLogImpl bwl) { + bwl.restartBatch(); + } + + @Override + public void write(BatchingWorkLogImpl bwl) throws IOException { + bwl.writeLine("# " + LocalDateTime.now()); + bwl.writeLine("# discarding batchNumber = " + bwl.getBatchNumber()); + bwl.writeLine(Character.toString(MARKER)); + } + + +} +record AddItem(String id) implements WorkLogItem { + final static char MARKER = '+'; + + @Override + public void replay(BatchingWorkLogImpl bwl) { + bwl.addItemToCurrentBatch(id); + } + + @Override + public void write(BatchingWorkLogImpl bwl) throws IOException { + bwl.writeLine(MARKER + " " + id); + } +} + +class WorkLogParseException extends RuntimeException { + @Serial + private static final long serialVersionUID = -1238138989389021166L; + + public WorkLogParseException(String logLine) { + super("Failed to parse work log line: '" + logLine + "'"); + } +} \ No newline at end of file diff --git a/code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java b/code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java new file mode 100644 index 00000000..fa765fd2 --- /dev/null +++ b/code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.worklog; + +import 
org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class BatchingWorkLogImplTest { + Path fileName; + + @BeforeEach + public void setUp() throws IOException { + fileName = Files.createTempFile(getClass().getSimpleName(), ".test"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(fileName); + } + + @Test + public void testResumeOnEmptyFile() throws IOException { + Files.delete(fileName); + + try (var wl = new BatchingWorkLogImpl(fileName)) { + wl.logItem("1"); + wl.logItem("2"); + wl.logItem("3"); + wl.logFinishedBatch(); + wl.logItem("4"); + wl.logItem("5"); + wl.logFinishedBatch(); + wl.logItem("6"); + } + + try (var wl = new BatchingWorkLogImpl(fileName)) { + assertTrue(wl.isItemCommitted("1")); + assertTrue(wl.isItemCommitted("2")); + assertTrue(wl.isItemCommitted("3")); + assertTrue(wl.isItemCommitted("4")); + assertTrue(wl.isItemCommitted("5")); + assertFalse(wl.isItemCommitted("6")); + wl.logItem("7"); + wl.logFinishedBatch(); + } + try (var wl = new BatchingWorkLogImpl(fileName)) { + assertTrue(wl.isItemCommitted("1")); + assertTrue(wl.isItemCommitted("2")); + assertTrue(wl.isItemCommitted("3")); + assertTrue(wl.isItemCommitted("4")); + assertTrue(wl.isItemCommitted("5")); + assertFalse(wl.isItemCommitted("6")); + assertTrue(wl.isItemCommitted("7")); + } + + Files.readAllLines(fileName).forEach(System.out::println); + } +} \ No newline at end of file diff --git a/settings.gradle b/settings.gradle index 884160c9..d2b54d41 100644 --- a/settings.gradle +++ b/settings.gradle @@ -65,6 +65,7 @@ include 'code:processes:test-data' include 'code:process-models:converting-model' include 'code:process-models:crawling-model' +include 'code:process-models:work-log' include 'code:tools:term-frequency-extractor' include 'code:tools:crawl-job-extractor' From 064bc5ee764bc6bdccf5bb6daa2571588fb2acde Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Sep 2023 14:08:40 +0200 Subject: [PATCH 05/14] (processed-data) New parquet-serializable models for converter output --- .../processed-data/build.gradle | 37 ++++++ .../ProcessedDocumentDataDehydrator.java | 37 ++++++ .../ProcessedDocumentDataHydrator.java | 24 ++++ .../ProcessedDomainDataDehydrator.java | 26 +++++ ...ProcessedDomainDataDomainNameHydrator.java | 26 +++++ .../ProcessedDomainDataHydrator.java | 23 ++++ .../ProcessedDocumentParquetFileReader.java | 20 ++++ .../ProcessedDocumentParquetFileWriter.java | 25 +++++ .../ProcessedDomainParquetFileReader.java | 32 ++++++ .../ProcessedDomainParquetFileWriter.java | 25 +++++ .../model/processed/ProcessedDataCodec.java | 6 + .../processed/ProcessedDocumentData.java | 105 ++++++++++++++++++ .../model/processed/ProcessedDomainData.java | 55 +++++++++ ...rocessedDocumentParquetFileReaderTest.java | 89 +++++++++++++++ .../ProcessedDomainParquetFileReaderTest.java | 63 +++++++++++ settings.gradle | 1 + 16 files changed, 594 insertions(+) create mode 100644 code/process-models/processed-data/build.gradle create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java create mode 100644 
code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReader.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDocumentData.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java create mode 100644 code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReaderTest.java create mode 100644 code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReaderTest.java diff --git a/code/process-models/processed-data/build.gradle b/code/process-models/processed-data/build.gradle new file mode 100644 index 00000000..5159bc8b --- /dev/null +++ b/code/process-models/processed-data/build.gradle @@ -0,0 +1,37 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "8.2.2" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(20)) + } +} +dependencies { + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation project(':third-party:parquet-floor') + + implementation libs.notnull + implementation libs.trove + implementation libs.bundles.parquet + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java new file mode 100644 index 00000000..8a615186 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java @@ -0,0 +1,37 @@ +package nu.marginalia.codec.processed; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.ValueWriter; +import nu.marginalia.model.processed.ProcessedDocumentData; + +public class ProcessedDocumentDataDehydrator implements Dehydrator { + @Override + public void dehydrate(ProcessedDocumentData record, ValueWriter valueWriter) { + valueWriter.write("domain", record.domain); + valueWriter.write("url", record.url); + valueWriter.write("ordinal", record.ordinal); + valueWriter.write("state", record.state); + + if (record.stateReason != null) + valueWriter.write("stateReason", 
record.stateReason); + if (record.title != null) + valueWriter.write("title", record.title); + if (record.description != null) + valueWriter.write("description", record.description); + valueWriter.write("htmlFeatures", record.htmlFeatures); + valueWriter.write("htmlStandard", record.htmlStandard); + valueWriter.write("length", record.length); + valueWriter.write("hash", record.hash); + valueWriter.write("quality", record.quality); + if (record.pubYear != null) { + valueWriter.write("pubYear", record.pubYear); + } + + if (record.metas != null) { + valueWriter.writeList("wordMeta", record.metas); + } + if (record.words != null) { + valueWriter.writeList("word", record.words); + } + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java new file mode 100644 index 00000000..c04147fd --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java @@ -0,0 +1,24 @@ +package nu.marginalia.codec.processed; + +import blue.strategic.parquet.Hydrator; +import nu.marginalia.model.processed.ProcessedDocumentData; +import nu.marginalia.model.processed.ProcessedDomainData; + +public class ProcessedDocumentDataHydrator implements Hydrator { + + @Override + public ProcessedDocumentData start() { + return new ProcessedDocumentData(); + } + + @Override + public ProcessedDocumentData add(ProcessedDocumentData target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public ProcessedDocumentData finish(ProcessedDocumentData target) { + return target; + } + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java new file mode 100644 index 00000000..4a52a54c --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java @@ -0,0 +1,26 @@ +package nu.marginalia.codec.processed; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.ValueWriter; +import nu.marginalia.model.processed.ProcessedDomainData; + +public class ProcessedDomainDataDehydrator implements Dehydrator { + + + @Override + public void dehydrate(ProcessedDomainData record, ValueWriter valueWriter) { + valueWriter.write("domain", record.domain); + valueWriter.write("knownUrls", record.knownUrls); + valueWriter.write("goodUrls", record.goodUrls); + valueWriter.write("visitedUrls", record.visitedUrls); + if (record.state != null) { + valueWriter.write("state", record.state); + } + if (record.redirectDomain != null) { + valueWriter.write("redirectDomain", record.redirectDomain); + } + if (record.ip != null) { + valueWriter.write("ip", record.ip); + } + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java new file mode 100644 index 00000000..945fad26 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java @@ -0,0 +1,26 @@ +package nu.marginalia.codec.processed; + +import 
blue.strategic.parquet.Hydrator; + + +public class ProcessedDomainDataDomainNameHydrator implements Hydrator { + + @Override + public String start() { + return ""; + } + + @Override + public String add(String target, String heading, Object value) { + if ("domain".equals(heading)) { + return (String) value; + } + return target; + } + + @Override + public String finish(String target) { + return target; + } + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java new file mode 100644 index 00000000..aa9531a1 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java @@ -0,0 +1,23 @@ +package nu.marginalia.codec.processed; + +import blue.strategic.parquet.Hydrator; +import nu.marginalia.model.processed.ProcessedDomainData; + +public class ProcessedDomainDataHydrator implements Hydrator { + + @Override + public ProcessedDomainData start() { + return new ProcessedDomainData(); + } + + @Override + public ProcessedDomainData add(ProcessedDomainData target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public ProcessedDomainData finish(ProcessedDomainData target) { + return target; + } + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java new file mode 100644 index 00000000..ff82a197 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java @@ -0,0 +1,20 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.HydratorSupplier; +import blue.strategic.parquet.ParquetReader; +import nu.marginalia.codec.processed.ProcessedDocumentDataHydrator; +import nu.marginalia.model.processed.ProcessedDocumentData; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.stream.Stream; + +public class ProcessedDocumentParquetFileReader { + + @NotNull + public static Stream stream(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(new ProcessedDocumentDataHydrator())); + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java new file mode 100644 index 00000000..37f92a78 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java @@ -0,0 +1,25 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.codec.processed.ProcessedDocumentDataDehydrator; +import nu.marginalia.model.processed.ProcessedDocumentData; + +import java.io.IOException; +import java.nio.file.Path; + +public class ProcessedDocumentParquetFileWriter implements AutoCloseable { + private final ParquetWriter writer; + + public ProcessedDocumentParquetFileWriter(Path file) throws IOException { + writer = ParquetWriter.writeFile(ProcessedDocumentData.schema, + file.toFile(), new ProcessedDocumentDataDehydrator()); + } + + 
public void write(ProcessedDocumentData domainData) throws IOException { + writer.write(domainData); + } + + public void close() throws IOException { + writer.close(); + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReader.java new file mode 100644 index 00000000..1324cfe1 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReader.java @@ -0,0 +1,32 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.HydratorSupplier; +import blue.strategic.parquet.ParquetReader; +import nu.marginalia.codec.processed.ProcessedDomainDataDomainNameHydrator; +import nu.marginalia.codec.processed.ProcessedDomainDataHydrator; +import nu.marginalia.model.processed.ProcessedDomainData; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Stream; + +public class ProcessedDomainParquetFileReader { + + @NotNull + public static Stream stream(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(new ProcessedDomainDataHydrator())); + } + + @NotNull + public static List getDomainNames(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(new ProcessedDomainDataDomainNameHydrator()), + List.of("domain")) + .toList(); + } + + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java new file mode 100644 index 00000000..862615a5 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java @@ -0,0 +1,25 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.codec.processed.ProcessedDomainDataDehydrator; +import nu.marginalia.model.processed.ProcessedDomainData; + +import java.io.IOException; +import java.nio.file.Path; + +public class ProcessedDomainParquetFileWriter implements AutoCloseable { + private final ParquetWriter writer; + + public ProcessedDomainParquetFileWriter(Path file) throws IOException { + writer = ParquetWriter.writeFile(ProcessedDomainData.schema, + file.toFile(), new ProcessedDomainDataDehydrator()); + } + + public void write(ProcessedDomainData domainData) throws IOException { + writer.write(domainData); + } + + public void close() throws IOException { + writer.close(); + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java new file mode 100644 index 00000000..54ed35c7 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java @@ -0,0 +1,6 @@ +package nu.marginalia.model.processed; + + +public class ProcessedDataCodec { + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDocumentData.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDocumentData.java new file mode 100644 
index 00000000..5d7fad8f --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDocumentData.java @@ -0,0 +1,105 @@ +package nu.marginalia.model.processed; + +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import lombok.*; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; + +import java.util.ArrayList; +import java.util.List; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; + +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@EqualsAndHashCode +@ToString +public class ProcessedDocumentData { + @NotNull + public String domain; + @NotNull + public String url; + + public int ordinal; + + @NotNull + public String state; + @Nullable + public String stateReason; + + @Nullable + public String title; + @Nullable + public String description; + public int htmlFeatures; + @Nullable + public String htmlStandard; + + public int length; + public long hash; + public float quality; + + @Nullable + public Integer pubYear; + + @Nullable + public List words; + @Nullable + public List metas; + + public static MessageType schema = new MessageType( + ProcessedDocumentData.class.getSimpleName(), + Types.required(BINARY).as(stringType()).named("domain"), + Types.required(BINARY).as(stringType()).named("url"), + Types.required(INT32).named("ordinal"), + Types.required(BINARY).as(stringType()).named("state"), + Types.optional(BINARY).as(stringType()).named("stateReason"), + Types.optional(BINARY).as(stringType()).named("title"), + Types.optional(BINARY).as(stringType()).named("description"), + Types.optional(INT32).named("htmlFeatures"), + Types.optional(BINARY).as(stringType()).named("htmlStandard"), + Types.optional(INT64).named("hash"), + Types.optional(INT32).named("length"), + Types.optional(FLOAT).named("quality"), + Types.optional(INT32).named("pubYear"), + Types.repeated(INT64).named("wordMeta"), + Types.repeated(BINARY).as(stringType()).named("word") + ); + + public ProcessedDocumentData add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "url" -> url = (String) value; + case "ordinal" -> ordinal = (Integer) value; + case "htmlFeatures" -> htmlFeatures = (Integer) value; + case "length" -> length = (Integer) value; + case "pubYear" -> pubYear = (Integer) value; + case "hash" -> hash = (Long) value; + case "quality" -> quality = (Float) value; + case "state" -> state = (String) value; + case "stateReason" -> stateReason = (String) value; + case "title" -> title = (String) value; + case "description" -> description = (String) value; + case "htmlStandard" -> htmlStandard = (String) value; + case "word" -> { + if (this.words == null) + this.words = new ArrayList<>(100); + this.words.add((String) value); + } + case "wordMeta" -> { + if (this.metas == null) { + this.metas = new ArrayList<>(100); + } + this.metas.add((Long) value); + } + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java new file mode 100644 index 00000000..81b0241c --- /dev/null +++ 
b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java @@ -0,0 +1,55 @@ +package nu.marginalia.model.processed; + +import lombok.*; +import org.apache.parquet.schema.*; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.*; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; + +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@EqualsAndHashCode +public class ProcessedDomainData { + @NotNull + public String domain; + + public int knownUrls; + public int goodUrls; + public int visitedUrls; + + @Nullable + public String state; + @Nullable + public String redirectDomain; + @Nullable + public String ip; + + public static MessageType schema = new MessageType( + ProcessedDomainData.class.getSimpleName(), + Types.required(BINARY).as(stringType()).named("domain"), + Types.optional(INT32).named("knownUrls"), + Types.optional(INT32).named("visitedUrls"), + Types.optional(INT32).named("goodUrls"), + Types.required(BINARY).as(stringType()).named("state"), + Types.optional(BINARY).as(stringType()).named("redirectDomain"), + Types.optional(BINARY).as(stringType()).named("ip")); + + public ProcessedDomainData add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "knownUrls" -> knownUrls = (Integer) value; + case "visitedUrls" -> visitedUrls = (Integer) value; + case "goodUrls" -> goodUrls = (Integer) value; + case "state" -> state = (String) value; + case "redirectDomain" -> redirectDomain = (String) value; + case "ip" -> ip = (String) value; + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } +} diff --git a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReaderTest.java b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReaderTest.java new file mode 100644 index 00000000..5090a65c --- /dev/null +++ b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReaderTest.java @@ -0,0 +1,89 @@ +package nu.marginalia.io.processed; + +import nu.marginalia.model.processed.ProcessedDocumentData; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + +import static org.junit.jupiter.api.Assertions.*; + +class ProcessedDocumentParquetFileReaderTest { + Path parquetFile; + + @BeforeEach + public void setUp() throws IOException { + parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(parquetFile); + } + + @Test + public void test() throws IOException { + var doc = new ProcessedDocumentData( + "www.marginalia.nu", + "https://www.marginalia.nu/", + 0, + "OK", + null, + "Itsa me, Marginalia!", + "Hello World", + 3, + "HTML5", + 123, + 0xF00BA3L, + 0.25f, + null, + List.of("Hello", "world"), + List.of(2L, 3L) + ); + + try (var writer = new ProcessedDocumentParquetFileWriter(parquetFile)) { + writer.write(doc); + } + + var read = 
ProcessedDocumentParquetFileReader.stream(parquetFile).toList(); + assertEquals(List.of(doc), read); + } + + @Test + public void testHugePayload() throws IOException { + List words = IntStream.range(0, 100000).mapToObj(Integer::toString).toList(); + List metas = LongStream.range(0, 100000).boxed().toList(); + + var doc = new ProcessedDocumentData( + "www.marginalia.nu", + "https://www.marginalia.nu/", + 0, + "OK", + null, + "Itsa me, Marginalia!", + "Hello World", + 3, + "HTML5", + 123, + 0xF00BA3L, + 0.25f, + null, + words, + metas + ); + + try (var writer = new ProcessedDocumentParquetFileWriter(parquetFile)) { + writer.write(doc); + } + + var read = ProcessedDocumentParquetFileReader.stream(parquetFile).toList(); + assertEquals(List.of(doc), read); + } + +} \ No newline at end of file diff --git a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReaderTest.java b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReaderTest.java new file mode 100644 index 00000000..5264624d --- /dev/null +++ b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReaderTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.io.processed; + +import nu.marginalia.model.processed.ProcessedDomainData; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class ProcessedDomainParquetFileReaderTest { + Path parquetFile; + + @BeforeEach + public void setUp() throws IOException { + parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(parquetFile); + } + + @Test + public void testReadFull() throws IOException { + var first = new ProcessedDomainData( + "www.marginalia.nu", + 10, + 3, + 5, + "'sall good man", + null, + "127.0.0.1" + ); + var second = new ProcessedDomainData( + "memex.marginalia.nu", + 0, + 0, + 0, + "REDIRECT", + "www.marginalia.nu", + "127.0.0.1" + ); + + try (var writer = new ProcessedDomainParquetFileWriter(parquetFile)) { + writer.write(first); + writer.write(second); + } + + var domainNames = ProcessedDomainParquetFileReader.getDomainNames(parquetFile); + assertEquals(List.of("www.marginalia.nu", "memex.marginalia.nu"), domainNames); + + var items = ProcessedDomainParquetFileReader + .stream(parquetFile) + .toList(); + assertEquals(List.of(first, second), items); + } + +} \ No newline at end of file diff --git a/settings.gradle b/settings.gradle index d2b54d41..af44349d 100644 --- a/settings.gradle +++ b/settings.gradle @@ -66,6 +66,7 @@ include 'code:processes:test-data' include 'code:process-models:converting-model' include 'code:process-models:crawling-model' include 'code:process-models:work-log' +include 'code:process-models:processed-data' include 'code:tools:term-frequency-extractor' include 'code:tools:crawl-job-extractor' From 9f672a0cf4a648f18e3e1b3d3c90b4d027640842 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 13 Sep 2023 15:56:35 +0200 Subject: [PATCH 06/14] (parquet-floor) Modify the parquet library to permit list-fields. 
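Repeated (list-valued) fields let a single document row carry its keywords and keyword metadata. A minimal sketch of how such a field is declared and written with this API (ExampleRecord and the lambda are illustrative, not part of the patch; the schema helpers are the same static Types/LogicalTypeAnnotation imports used by the processed-data models above):

    // schema: one required string column and one repeated 64-bit column
    MessageType schema = new MessageType("Example",
            Types.required(BINARY).as(stringType()).named("domain"),
            Types.repeated(INT64).named("wordMeta"));

    // dehydrator: writeList() emits every element of the list into the repeated field
    Dehydrator<ExampleRecord> dehydrator = (record, valueWriter) -> {
        valueWriter.write("domain", record.domain);
        valueWriter.writeList("wordMeta", record.metas);  // List<Long>, or a TLongList via the new overload
    };

On the read side, ParquetReader now loops on the column's repetition level, so the hydrator's add() is invoked once per element of a repeated field.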
--- third-party/parquet-floor/build.gradle | 2 + .../blue/strategic/parquet/ParquetReader.java | 5 +- .../blue/strategic/parquet/ParquetWriter.java | 61 +++++++++++++++++++ .../blue/strategic/parquet/ValueWriter.java | 5 ++ 4 files changed, 72 insertions(+), 1 deletion(-) diff --git a/third-party/parquet-floor/build.gradle b/third-party/parquet-floor/build.gradle index f8393044..05277c51 100644 --- a/third-party/parquet-floor/build.gradle +++ b/third-party/parquet-floor/build.gradle @@ -16,6 +16,8 @@ dependencies { exclude group: 'commons-pool', module: 'commons-pool' transitive = true } + + implementation libs.trove } test { diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java index 3eee03d0..1ec3e7fb 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java @@ -192,7 +192,10 @@ public final class ParquetReader implements Spliterator, Closeable { U record = hydrator.start(); for (ColumnReader columnReader: this.currentRowGroupColumnReaders) { do { - record = hydrator.add(record, columnReader.getDescriptor().getPath()[0], readValue(columnReader)); + var value = readValue(columnReader); + if (value != null) { + record = hydrator.add(record, columnReader.getDescriptor().getPath()[0], value); + } columnReader.consume(); } while (columnReader.getCurrentRepetitionLevel() != 0); diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java index 7840c49e..6e53c189 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java @@ -1,5 +1,7 @@ package blue.strategic.parquet; +import gnu.trove.list.TIntList; +import gnu.trove.list.TLongList; import org.apache.hadoop.conf.Configuration; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.hadoop.api.WriteSupport; @@ -20,6 +22,9 @@ import java.io.IOException; import java.util.Collections; import java.util.List; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; + public final class ParquetWriter implements Closeable { private final org.apache.parquet.hadoop.ParquetWriter writer; @@ -117,6 +122,28 @@ public final class ParquetWriter implements Closeable { @Override public void writeList(String name, List value) { + if (value.isEmpty()) { + return; + } + + SimpleWriteSupport.this.writeList(name, value); + } + + @Override + public void writeList(String name, TLongList value) { + if (value.isEmpty()) { + return; + } + + SimpleWriteSupport.this.writeList(name, value); + } + + @Override + public void writeList(String name, TIntList value) { + if (value.isEmpty()) { + return; + } + SimpleWriteSupport.this.writeList(name, value); } }; @@ -170,6 +197,40 @@ public final class ParquetWriter implements Closeable { recordConsumer.endField(name, fieldIndex); } + private void writeList(String name, TLongList values) { + int fieldIndex = schema.getFieldIndex(name); + PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType(); + recordConsumer.startField(name, fieldIndex); + + for (int i = 0; i < values.size(); i++) { + writeValue(type, 
values.get(i)); + } + + recordConsumer.endField(name, fieldIndex); + } + + private void writeList(String name, TIntList values) { + int fieldIndex = schema.getFieldIndex(name); + PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType(); + recordConsumer.startField(name, fieldIndex); + + for (int i = 0; i < values.size(); i++) { + writeValue(type, values.get(i)); + } + + recordConsumer.endField(name, fieldIndex); + } + + void writeValue(PrimitiveType type, long value) { + assert type.getPrimitiveTypeName() == INT64; + recordConsumer.addLong(value); + } + + void writeValue(PrimitiveType type, int value) { + assert type.getPrimitiveTypeName() == INT32; + recordConsumer.addInteger(value); + } + void writeValue(PrimitiveType type, Object value) { switch (type.getPrimitiveTypeName()) { case INT32: recordConsumer.addInteger((int)value); break; diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java index e8cda912..962f3b50 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java @@ -1,8 +1,13 @@ package blue.strategic.parquet; +import gnu.trove.list.TIntList; +import gnu.trove.list.TLongList; + import java.util.List; public interface ValueWriter { void write(String name, Object value); void writeList(String name, List value); + void writeList(String name, TLongList value); + void writeList(String name, TIntList value); } From 24b4606f9653c6a9bc2264d785552ea1445d9fbd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 13 Sep 2023 16:13:41 +0200 Subject: [PATCH 07/14] (converter,loader) Converter outputs parquet files instead of compressed json. --- .../ProcessedDocumentDataDehydrator.java | 37 --- .../ProcessedDocumentDataHydrator.java | 24 -- .../ProcessedDomainDataDehydrator.java | 26 -- ...ProcessedDomainDataDomainNameHydrator.java | 26 -- .../ProcessedDomainDataHydrator.java | 23 -- .../DocumentRecordParquetFileReader.java | 37 +++ .../DocumentRecordParquetFileWriter.java | 24 ++ .../DomainLinkRecordParquetFileReader.java | 31 +++ .../DomainLinkRecordParquetFileWriter.java | 24 ++ ...ava => DomainRecordParquetFileReader.java} | 12 +- .../DomainRecordParquetFileWriter.java | 24 ++ .../io/processed/ProcessedDataFileNames.java | 73 ++++++ .../ProcessedDocumentParquetFileReader.java | 20 -- .../ProcessedDocumentParquetFileWriter.java | 25 -- .../ProcessedDomainParquetFileWriter.java | 25 -- ...dDocumentData.java => DocumentRecord.java} | 78 +++++- .../DocumentRecordKeywordsProjection.java | 92 +++++++ .../DocumentRecordMetadataProjection.java | 100 ++++++++ .../model/processed/DomainLinkRecord.java | 97 ++++++++ .../model/processed/DomainRecord.java | 146 +++++++++++ .../model/processed/ProcessedDataCodec.java | 6 - .../model/processed/ProcessedDomainData.java | 55 ----- ... 
DocumentRecordParquetFileReaderTest.java} | 23 +- ...DomainLinkRecordParquetFileReaderTest.java | 49 ++++ ...=> DomainRecordParquetFileReaderTest.java} | 20 +- .../marginalia/worklog/BatchingWorkLog.java | 9 + .../worklog/BatchingWorkLogImpl.java | 34 ++- .../worklog/BatchingWorkLogInspector.java | 22 ++ .../worklog/BatchingWorkLogImplTest.java | 11 + .../processes/converting-process/build.gradle | 2 + .../marginalia/converting/ConverterMain.java | 32 +-- .../writer/ConverterBatchWriter.java | 232 ++++++++++++++++++ .../converting/writer/ConverterWriter.java | 125 ++++++++++ code/processes/loading-process/build.gradle | 2 + .../loading/ConvertedDomainReader.java | 109 -------- .../nu/marginalia/loading/LoaderMain.java | 126 ++++------ .../java/nu/marginalia/loading/TaskStats.java | 37 --- .../documents/DocumentLoaderService.java | 97 ++++++++ .../documents/KeywordLoaderService.java | 57 +++++ .../LoaderIndexJournalWriter.java | 14 +- .../loading/domains/DomainIdRegistry.java | 36 +++ .../loading/domains/DomainLoaderService.java | 97 ++++++++ .../links/DomainLinksLoaderService.java | 104 ++++++++ .../loading/loader/IndexLoadKeywords.java | 47 ---- .../loader/LdbLoadProcessedDocument.java | 83 ------- .../nu/marginalia/loading/loader/Loader.java | 118 --------- .../marginalia/loading/loader/LoaderData.java | 38 --- .../loading/loader/LoaderFactory.java | 31 --- .../loading/loader/SqlLoadDomainLinks.java | 84 ------- .../loading/loader/SqlLoadDomainMetadata.java | 41 ---- .../loading/loader/SqlLoadDomains.java | 163 ------------ .../loader/SqlLoadProcessedDomain.java | 104 -------- .../loader/SqlLoadDomainLinksTest.java | 52 ---- .../marginalia/loader/SqlLoadDomainsTest.java | 55 ----- .../loader/SqlLoadProcessedDomainTest.java | 76 ------ .../domains/DomainLoaderServiceTest.java | 130 ++++++++++ .../links/DomainLinksLoaderServiceTest.java | 124 ++++++++++ .../loader/LoaderIndexJournalWriterTest.java | 1 + 58 files changed, 1944 insertions(+), 1446 deletions(-) delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java rename code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/{ProcessedDomainParquetFileReader.java => DomainRecordParquetFileReader.java} (53%) create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java create mode 100644 
code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDataFileNames.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java rename code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/{ProcessedDocumentData.java => DocumentRecord.java} (58%) create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainLinkRecord.java create mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java delete mode 100644 code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java rename code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/{ProcessedDocumentParquetFileReaderTest.java => DocumentRecordParquetFileReaderTest.java} (72%) create mode 100644 code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java rename code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/{ProcessedDomainParquetFileReaderTest.java => DomainRecordParquetFileReaderTest.java} (72%) create mode 100644 code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogInspector.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/TaskStats.java create mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java create mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java rename code/processes/loading-process/src/main/java/nu/marginalia/loading/{loader => documents}/LoaderIndexJournalWriter.java (90%) create mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java create mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java create mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java delete 
mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainLinks.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java delete mode 100644 code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java delete mode 100644 code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java delete mode 100644 code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java delete mode 100644 code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java create mode 100644 code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java create mode 100644 code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java deleted file mode 100644 index 8a615186..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataDehydrator.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.codec.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.ValueWriter; -import nu.marginalia.model.processed.ProcessedDocumentData; - -public class ProcessedDocumentDataDehydrator implements Dehydrator { - @Override - public void dehydrate(ProcessedDocumentData record, ValueWriter valueWriter) { - valueWriter.write("domain", record.domain); - valueWriter.write("url", record.url); - valueWriter.write("ordinal", record.ordinal); - valueWriter.write("state", record.state); - - if (record.stateReason != null) - valueWriter.write("stateReason", record.stateReason); - if (record.title != null) - valueWriter.write("title", record.title); - if (record.description != null) - valueWriter.write("description", record.description); - valueWriter.write("htmlFeatures", record.htmlFeatures); - valueWriter.write("htmlStandard", record.htmlStandard); - valueWriter.write("length", record.length); - valueWriter.write("hash", record.hash); - valueWriter.write("quality", record.quality); - if (record.pubYear != null) { - valueWriter.write("pubYear", record.pubYear); - } - - if (record.metas != null) { - valueWriter.writeList("wordMeta", record.metas); - } - if (record.words != null) { - valueWriter.writeList("word", record.words); - } - } -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java deleted file mode 100644 index c04147fd..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDocumentDataHydrator.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.codec.processed; - -import blue.strategic.parquet.Hydrator; 
-import nu.marginalia.model.processed.ProcessedDocumentData; -import nu.marginalia.model.processed.ProcessedDomainData; - -public class ProcessedDocumentDataHydrator implements Hydrator { - - @Override - public ProcessedDocumentData start() { - return new ProcessedDocumentData(); - } - - @Override - public ProcessedDocumentData add(ProcessedDocumentData target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public ProcessedDocumentData finish(ProcessedDocumentData target) { - return target; - } - -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java deleted file mode 100644 index 4a52a54c..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDehydrator.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.codec.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.ValueWriter; -import nu.marginalia.model.processed.ProcessedDomainData; - -public class ProcessedDomainDataDehydrator implements Dehydrator { - - - @Override - public void dehydrate(ProcessedDomainData record, ValueWriter valueWriter) { - valueWriter.write("domain", record.domain); - valueWriter.write("knownUrls", record.knownUrls); - valueWriter.write("goodUrls", record.goodUrls); - valueWriter.write("visitedUrls", record.visitedUrls); - if (record.state != null) { - valueWriter.write("state", record.state); - } - if (record.redirectDomain != null) { - valueWriter.write("redirectDomain", record.redirectDomain); - } - if (record.ip != null) { - valueWriter.write("ip", record.ip); - } - } -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java deleted file mode 100644 index 945fad26..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataDomainNameHydrator.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.codec.processed; - -import blue.strategic.parquet.Hydrator; - - -public class ProcessedDomainDataDomainNameHydrator implements Hydrator { - - @Override - public String start() { - return ""; - } - - @Override - public String add(String target, String heading, Object value) { - if ("domain".equals(heading)) { - return (String) value; - } - return target; - } - - @Override - public String finish(String target) { - return target; - } - -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java b/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java deleted file mode 100644 index aa9531a1..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/codec/processed/ProcessedDomainDataHydrator.java +++ /dev/null @@ -1,23 +0,0 @@ -package nu.marginalia.codec.processed; - -import blue.strategic.parquet.Hydrator; -import nu.marginalia.model.processed.ProcessedDomainData; - -public class ProcessedDomainDataHydrator implements Hydrator { - - @Override - public ProcessedDomainData start() { - return new ProcessedDomainData(); - } - - @Override - public ProcessedDomainData add(ProcessedDomainData target, 
String heading, Object value) { - return target.add(heading, value); - } - - @Override - public ProcessedDomainData finish(ProcessedDomainData target) { - return target; - } - -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java new file mode 100644 index 00000000..dae53224 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java @@ -0,0 +1,37 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.HydratorSupplier; +import blue.strategic.parquet.ParquetReader; +import nu.marginalia.model.processed.DocumentRecord; +import nu.marginalia.model.processed.DocumentRecordKeywordsProjection; +import nu.marginalia.model.processed.DocumentRecordMetadataProjection; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.stream.Stream; + +public class DocumentRecordParquetFileReader { + + @NotNull + public static Stream stream(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(DocumentRecord.newHydrator())); + } + + @NotNull + public static Stream streamKeywordsProjection(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(DocumentRecordKeywordsProjection.newHydrator()), + DocumentRecordKeywordsProjection.requiredColumns() + ); + } + + @NotNull + public static Stream streamMetadataProjection(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(DocumentRecordMetadataProjection.newHydrator()), + DocumentRecordMetadataProjection.requiredColumns() + ); + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java new file mode 100644 index 00000000..62eec879 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java @@ -0,0 +1,24 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.model.processed.DocumentRecord; + +import java.io.IOException; +import java.nio.file.Path; + +public class DocumentRecordParquetFileWriter implements AutoCloseable { + private final ParquetWriter writer; + + public DocumentRecordParquetFileWriter(Path file) throws IOException { + writer = ParquetWriter.writeFile(DocumentRecord.schema, + file.toFile(), DocumentRecord.newDehydrator()); + } + + public void write(DocumentRecord domainData) throws IOException { + writer.write(domainData); + } + + public void close() throws IOException { + writer.close(); + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java new file mode 100644 index 00000000..e778169e --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java @@ -0,0 +1,31 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.HydratorSupplier; +import 
blue.strategic.parquet.ParquetReader; +import nu.marginalia.model.processed.DomainLinkRecord; +import nu.marginalia.model.processed.DomainRecord; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class DomainLinkRecordParquetFileReader { + @NotNull + public static Stream stream(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(DomainLinkRecord.newHydrator())); + } + + @NotNull + public static Set getDestDomainNames(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(DomainLinkRecord.newDestDomainHydrator()), + List.of("dest")) + .collect(Collectors.toSet()); + } + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java new file mode 100644 index 00000000..28cf3aa0 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java @@ -0,0 +1,24 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.model.processed.DomainLinkRecord; + +import java.io.IOException; +import java.nio.file.Path; + +public class DomainLinkRecordParquetFileWriter implements AutoCloseable { + private final ParquetWriter writer; + + public DomainLinkRecordParquetFileWriter(Path file) throws IOException { + writer = ParquetWriter.writeFile(DomainLinkRecord.schema, + file.toFile(), DomainLinkRecord.newDehydrator()); + } + + public void write(DomainLinkRecord domainData) throws IOException { + writer.write(domainData); + } + + public void close() throws IOException { + writer.close(); + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java similarity index 53% rename from code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReader.java rename to code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java index 1324cfe1..a31b199d 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReader.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java @@ -2,9 +2,7 @@ package nu.marginalia.io.processed; import blue.strategic.parquet.HydratorSupplier; import blue.strategic.parquet.ParquetReader; -import nu.marginalia.codec.processed.ProcessedDomainDataDomainNameHydrator; -import nu.marginalia.codec.processed.ProcessedDomainDataHydrator; -import nu.marginalia.model.processed.ProcessedDomainData; +import nu.marginalia.model.processed.DomainRecord; import org.jetbrains.annotations.NotNull; import java.io.IOException; @@ -12,18 +10,18 @@ import java.nio.file.Path; import java.util.List; import java.util.stream.Stream; -public class ProcessedDomainParquetFileReader { +public class DomainRecordParquetFileReader { @NotNull - public static Stream stream(Path path) throws IOException { + public static Stream stream(Path 
path) throws IOException { return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(new ProcessedDomainDataHydrator())); + HydratorSupplier.constantly(DomainRecord.newHydrator())); } @NotNull public static List getDomainNames(Path path) throws IOException { return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(new ProcessedDomainDataDomainNameHydrator()), + HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()), List.of("domain")) .toList(); } diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java new file mode 100644 index 00000000..31c59582 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java @@ -0,0 +1,24 @@ +package nu.marginalia.io.processed; + +import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.model.processed.DomainRecord; + +import java.io.IOException; +import java.nio.file.Path; + +public class DomainRecordParquetFileWriter implements AutoCloseable { + private final ParquetWriter writer; + + public DomainRecordParquetFileWriter(Path file) throws IOException { + writer = ParquetWriter.writeFile(DomainRecord.schema, + file.toFile(), DomainRecord.newDehydrator()); + } + + public void write(DomainRecord domainData) throws IOException { + writer.write(domainData); + } + + public void close() throws IOException { + writer.close(); + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDataFileNames.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDataFileNames.java new file mode 100644 index 00000000..fafb393f --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDataFileNames.java @@ -0,0 +1,73 @@ +package nu.marginalia.io.processed; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class ProcessedDataFileNames { + public static Path documentFileName(Path base, int batchNumber) { + return base.resolve(String.format("document%04d.parquet", batchNumber)); + } + public static Path domainFileName(Path base, int batchNumber) { + return base.resolve(String.format("domain%04d.parquet", batchNumber)); + } + public static Path domainLinkFileName(Path base, int batchNumber) { + return base.resolve(String.format("domain-link%04d.parquet", batchNumber)); + } + + public static List listDocumentFiles(Path base, int untilBatch) { + List ret = new ArrayList<>(untilBatch); + + for (int i = 0; i < untilBatch; i++) { + Path maybe = documentFileName(base, i); + if (Files.exists(maybe)) { + ret.add(maybe); + } + } + + return ret; + } + + public static List listDomainFiles(Path base, int untilBatch) { + List ret = new ArrayList<>(untilBatch); + + for (int i = 0; i < untilBatch; i++) { + Path maybe = domainFileName(base, i); + if (Files.exists(maybe)) { + ret.add(maybe); + } + } + + return ret; + } + + public static List listDomainFiles(Path base) { + List ret = new ArrayList<>(); + + for (int i = 0;; i++) { + Path maybe = domainFileName(base, i); + if (Files.exists(maybe)) { + ret.add(maybe); + } + else { + break; + } + } + + return ret; + } + + public static List listDomainLinkFiles(Path base, int untilBatch) { + List ret = new ArrayList<>(untilBatch); + + for (int 
i = 0; i < untilBatch; i++) { + Path maybe = domainLinkFileName(base, i); + if (Files.exists(maybe)) { + ret.add(maybe); + } + } + + return ret; + } +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java deleted file mode 100644 index ff82a197..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReader.java +++ /dev/null @@ -1,20 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.HydratorSupplier; -import blue.strategic.parquet.ParquetReader; -import nu.marginalia.codec.processed.ProcessedDocumentDataHydrator; -import nu.marginalia.model.processed.ProcessedDocumentData; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.stream.Stream; - -public class ProcessedDocumentParquetFileReader { - - @NotNull - public static Stream stream(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(new ProcessedDocumentDataHydrator())); - } -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java deleted file mode 100644 index 37f92a78..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileWriter.java +++ /dev/null @@ -1,25 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import nu.marginalia.codec.processed.ProcessedDocumentDataDehydrator; -import nu.marginalia.model.processed.ProcessedDocumentData; - -import java.io.IOException; -import java.nio.file.Path; - -public class ProcessedDocumentParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public ProcessedDocumentParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(ProcessedDocumentData.schema, - file.toFile(), new ProcessedDocumentDataDehydrator()); - } - - public void write(ProcessedDocumentData domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java b/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java deleted file mode 100644 index 862615a5..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/io/processed/ProcessedDomainParquetFileWriter.java +++ /dev/null @@ -1,25 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import nu.marginalia.codec.processed.ProcessedDomainDataDehydrator; -import nu.marginalia.model.processed.ProcessedDomainData; - -import java.io.IOException; -import java.nio.file.Path; - -public class ProcessedDomainParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public ProcessedDomainParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(ProcessedDomainData.schema, - file.toFile(), new ProcessedDomainDataDehydrator()); - } - - public void write(ProcessedDomainData 
domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDocumentData.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecord.java similarity index 58% rename from code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDocumentData.java rename to code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecord.java index 5d7fad8f..c90df7ee 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDocumentData.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecord.java @@ -1,5 +1,8 @@ package nu.marginalia.model.processed; +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.Hydrator; +import blue.strategic.parquet.ValueWriter; import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; import lombok.*; @@ -20,7 +23,7 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; @AllArgsConstructor @EqualsAndHashCode @ToString -public class ProcessedDocumentData { +public class DocumentRecord { @NotNull public String domain; @NotNull @@ -45,16 +48,26 @@ public class ProcessedDocumentData { public long hash; public float quality; + public long documentMetadata; + @Nullable public Integer pubYear; @Nullable public List words; @Nullable - public List metas; + public TLongList metas; + + public static Hydrator newHydrator() { + return new DocumentDataHydrator(); + } + + public static Dehydrator newDehydrator() { + return DocumentRecord::dehydrate; + } public static MessageType schema = new MessageType( - ProcessedDocumentData.class.getSimpleName(), + DocumentRecord.class.getSimpleName(), Types.required(BINARY).as(stringType()).named("domain"), Types.required(BINARY).as(stringType()).named("url"), Types.required(INT32).named("ordinal"), @@ -65,6 +78,7 @@ public class ProcessedDocumentData { Types.optional(INT32).named("htmlFeatures"), Types.optional(BINARY).as(stringType()).named("htmlStandard"), Types.optional(INT64).named("hash"), + Types.optional(INT64).named("documentMetadata"), Types.optional(INT32).named("length"), Types.optional(FLOAT).named("quality"), Types.optional(INT32).named("pubYear"), @@ -72,7 +86,7 @@ public class ProcessedDocumentData { Types.repeated(BINARY).as(stringType()).named("word") ); - public ProcessedDocumentData add(String heading, Object value) { + public DocumentRecord add(String heading, Object value) { switch (heading) { case "domain" -> domain = (String) value; case "url" -> url = (String) value; @@ -81,6 +95,7 @@ public class ProcessedDocumentData { case "length" -> length = (Integer) value; case "pubYear" -> pubYear = (Integer) value; case "hash" -> hash = (Long) value; + case "documentMetadata" -> documentMetadata = (Long) value; case "quality" -> quality = (Float) value; case "state" -> state = (String) value; case "stateReason" -> stateReason = (String) value; @@ -94,12 +109,63 @@ public class ProcessedDocumentData { } case "wordMeta" -> { if (this.metas == null) { - this.metas = new ArrayList<>(100); + this.metas = new TLongArrayList(100); } - this.metas.add((Long) value); + this.metas.add((long) value); } default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); } return this; } + + public void 
dehydrate(ValueWriter valueWriter) { + valueWriter.write("domain", domain); + valueWriter.write("url", url); + valueWriter.write("ordinal", ordinal); + valueWriter.write("state", state); + + if (stateReason != null) + valueWriter.write("stateReason", stateReason); + if (title != null) + valueWriter.write("title", title); + if (description != null) + valueWriter.write("description", description); + valueWriter.write("htmlFeatures", htmlFeatures); + valueWriter.write("htmlStandard", htmlStandard); + valueWriter.write("documentMetadata", documentMetadata); + valueWriter.write("length", length); + valueWriter.write("hash", hash); + valueWriter.write("quality", quality); + if (pubYear != null) { + valueWriter.write("pubYear", pubYear); + } + + if (metas != null) { + valueWriter.writeList("wordMeta", metas); + } + + if (words != null) { + valueWriter.writeList("word", words); + } + } + +} + +class DocumentDataHydrator implements Hydrator { + + @Override + public DocumentRecord start() { + return new DocumentRecord(); + } + + @Override + public DocumentRecord add(DocumentRecord target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public DocumentRecord finish(DocumentRecord target) { + return target; + } + } diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java new file mode 100644 index 00000000..16cdf2a8 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java @@ -0,0 +1,92 @@ +package nu.marginalia.model.processed; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.Hydrator; +import blue.strategic.parquet.ValueWriter; +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import lombok.*; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; + +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@EqualsAndHashCode +@ToString +public class DocumentRecordKeywordsProjection { + @NotNull + public String domain; + + public int ordinal; + + public int htmlFeatures; + public long documentMetadata; + + public List words; + public TLongList metas; + + public boolean hasKeywords() { + return words != null && metas != null; + } + + public static Hydrator newHydrator() { + return new DocumentRecordKeywordsProjectionHydrator(); + } + + public static Collection requiredColumns() { + return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata"); + } + + public DocumentRecordKeywordsProjection add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "ordinal" -> ordinal = (Integer) value; + case "htmlFeatures" -> htmlFeatures = (Integer) value; + case "documentMetadata" -> documentMetadata = (Long) value; + case "word" -> { + if (this.words == null) + this.words = new ArrayList<>(100); + this.words.add((String) value); + } + case "wordMeta" -> { + if (this.metas == null) { + this.metas = new 
TLongArrayList(100); + } + this.metas.add((long) value); + } + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } + +} + +class DocumentRecordKeywordsProjectionHydrator implements Hydrator { + + @Override + public DocumentRecordKeywordsProjection start() { + return new DocumentRecordKeywordsProjection(); + } + + @Override + public DocumentRecordKeywordsProjection add(DocumentRecordKeywordsProjection target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public DocumentRecordKeywordsProjection finish(DocumentRecordKeywordsProjection target) { + return target; + } + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java new file mode 100644 index 00000000..ccad52e3 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java @@ -0,0 +1,100 @@ +package nu.marginalia.model.processed; + +import blue.strategic.parquet.Hydrator; +import lombok.*; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; + +import java.util.Collection; +import java.util.List; + +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@EqualsAndHashCode +@ToString +public class DocumentRecordMetadataProjection { + @NotNull + public String domain; + @NotNull + public String url; + + public int ordinal; + + @NotNull + public String state; + @Nullable + public String stateReason; + + @Nullable + public String title; + @Nullable + public String description; + public int htmlFeatures; + @Nullable + public String htmlStandard; + + public int length; + public long hash; + public float quality; + + public long documentMetadata; + + @Nullable + public Integer pubYear; + + public static Collection requiredColumns() { + return List.of("domain", "url", "ordinal", "htmlFeatures", "length", "pubYear", + "hash", "documentMetadata", "quality", "state", "stateReason", + "title", "description", "htmlStandard"); + } + + public DocumentRecordMetadataProjection add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "url" -> url = (String) value; + case "ordinal" -> ordinal = (Integer) value; + case "htmlFeatures" -> htmlFeatures = (Integer) value; + case "length" -> length = (Integer) value; + case "pubYear" -> pubYear = (Integer) value; + case "hash" -> hash = (Long) value; + case "documentMetadata" -> documentMetadata = (Long) value; + case "quality" -> quality = (Float) value; + case "state" -> state = (String) value; + case "stateReason" -> stateReason = (String) value; + case "title" -> title = (String) value; + case "description" -> description = (String) value; + case "htmlStandard" -> htmlStandard = (String) value; + + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } + + public static Hydrator newHydrator() { + return new DocumentRecordMetadataHydrator(); + } + + + +} + +class DocumentRecordMetadataHydrator implements Hydrator { + + @Override + public DocumentRecordMetadataProjection start() { + return new DocumentRecordMetadataProjection(); + } + + @Override + public DocumentRecordMetadataProjection add(DocumentRecordMetadataProjection target, String heading, Object value) { + return target.add(heading, value); + 
} + + @Override + public DocumentRecordMetadataProjection finish(DocumentRecordMetadataProjection target) { + return target; + } + +} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainLinkRecord.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainLinkRecord.java new file mode 100644 index 00000000..298d6192 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainLinkRecord.java @@ -0,0 +1,97 @@ +package nu.marginalia.model.processed; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.Hydrator; +import blue.strategic.parquet.ValueWriter; +import lombok.*; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; +import org.jetbrains.annotations.NotNull; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; + +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@EqualsAndHashCode +public class DomainLinkRecord { + @NotNull + public String source; + + @NotNull + public String dest; + + public void dehydrate(ValueWriter valueWriter) { + valueWriter.write("source", source); + valueWriter.write("dest", dest); + } + + public static Dehydrator newDehydrator() { + return DomainLinkRecord::dehydrate; + } + + public static Hydrator newHydrator() { + return new DomainLinkDataHydrator(); + } + public static Hydrator newDestDomainHydrator() { + return new DestDomainNameHydrator(); + } + + public static MessageType schema = new MessageType( + DomainLinkRecord.class.getSimpleName(), + Types.required(BINARY).as(stringType()).named("source"), + Types.required(BINARY).as(stringType()).named("dest") + ); + + public DomainLinkRecord add(String heading, Object value) { + switch (heading) { + case "source" -> source = (String) value; + case "dest" -> dest = (String) value; + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } + +} + +class DomainLinkDataHydrator implements Hydrator { + + @Override + public DomainLinkRecord start() { + return new DomainLinkRecord(); + } + + @Override + public DomainLinkRecord add(DomainLinkRecord target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public DomainLinkRecord finish(DomainLinkRecord target) { + return target; + } + +} + +class DestDomainNameHydrator implements Hydrator { + + @Override + public String start() { + return ""; + } + + @Override + public String add(String target, String heading, Object value) { + if ("dest".equals(heading)) { + return (String) value; + } + return target; + } + + @Override + public String finish(String target) { + return target; + } +} \ No newline at end of file diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java new file mode 100644 index 00000000..e3a0c9f9 --- /dev/null +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java @@ -0,0 +1,146 @@ +package nu.marginalia.model.processed; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.Hydrator; +import blue.strategic.parquet.ValueWriter; +import lombok.*; +import org.apache.parquet.schema.*; +import org.jetbrains.annotations.NotNull; +import 
org.jetbrains.annotations.Nullable; + +import java.sql.Array; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.*; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; + +@Getter +@Setter +@NoArgsConstructor +@AllArgsConstructor +@EqualsAndHashCode +@ToString +public class DomainRecord { + @NotNull + public String domain; + + public int knownUrls; + public int goodUrls; + public int visitedUrls; + + @Nullable + public String state; + @Nullable + public String redirectDomain; + @Nullable + public String ip; + + public List rssFeeds; + + + public static Hydrator newHydrator() { + return new DomainHydrator(); + } + + public static Dehydrator newDehydrator() { + return DomainRecord::dehydrate; + } + + public static Hydrator newDomainNameHydrator() { + return new DomainNameHydrator(); + } + + + public static MessageType schema = new MessageType( + DomainRecord.class.getSimpleName(), + Types.required(BINARY).as(stringType()).named("domain"), + Types.optional(INT32).named("knownUrls"), + Types.optional(INT32).named("visitedUrls"), + Types.optional(INT32).named("goodUrls"), + Types.required(BINARY).as(stringType()).named("state"), + Types.optional(BINARY).as(stringType()).named("redirectDomain"), + Types.optional(BINARY).as(stringType()).named("ip"), + Types.repeated(BINARY).as(stringType()).named("rss") + ); + + DomainRecord add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "knownUrls" -> knownUrls = (Integer) value; + case "visitedUrls" -> visitedUrls = (Integer) value; + case "goodUrls" -> goodUrls = (Integer) value; + case "state" -> state = (String) value; + case "redirectDomain" -> redirectDomain = (String) value; + case "ip" -> ip = (String) value; + case "rss" -> { + if (rssFeeds == null) { + rssFeeds = new ArrayList<>(); + } + rssFeeds.add((String) value); + } + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } + + private void dehydrate(ValueWriter valueWriter) { + valueWriter.write("domain", domain); + valueWriter.write("knownUrls", knownUrls); + valueWriter.write("goodUrls", goodUrls); + valueWriter.write("visitedUrls", visitedUrls); + if (state != null) { + valueWriter.write("state", state); + } + if (redirectDomain != null) { + valueWriter.write("redirectDomain", redirectDomain); + } + if (ip != null) { + valueWriter.write("ip", ip); + } + if (rssFeeds != null) { + valueWriter.writeList("rss", rssFeeds); + } + } + +} + + +class DomainHydrator implements Hydrator { + @Override + public DomainRecord start() { + return new DomainRecord(); + } + + @Override + public DomainRecord add(DomainRecord target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public DomainRecord finish(DomainRecord target) { + return target; + } +} + +class DomainNameHydrator implements Hydrator { + + @Override + public String start() { + return ""; + } + + @Override + public String add(String target, String heading, Object value) { + if ("domain".equals(heading)) { + return (String) value; + } + return target; + } + + @Override + public String finish(String target) { + return target; + } +} \ No newline at end of file diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java 
b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java deleted file mode 100644 index 54ed35c7..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDataCodec.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.model.processed; - - -public class ProcessedDataCodec { - -} diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java deleted file mode 100644 index 81b0241c..00000000 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/ProcessedDomainData.java +++ /dev/null @@ -1,55 +0,0 @@ -package nu.marginalia.model.processed; - -import lombok.*; -import org.apache.parquet.schema.*; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.*; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -public class ProcessedDomainData { - @NotNull - public String domain; - - public int knownUrls; - public int goodUrls; - public int visitedUrls; - - @Nullable - public String state; - @Nullable - public String redirectDomain; - @Nullable - public String ip; - - public static MessageType schema = new MessageType( - ProcessedDomainData.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("domain"), - Types.optional(INT32).named("knownUrls"), - Types.optional(INT32).named("visitedUrls"), - Types.optional(INT32).named("goodUrls"), - Types.required(BINARY).as(stringType()).named("state"), - Types.optional(BINARY).as(stringType()).named("redirectDomain"), - Types.optional(BINARY).as(stringType()).named("ip")); - - public ProcessedDomainData add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "knownUrls" -> knownUrls = (Integer) value; - case "visitedUrls" -> visitedUrls = (Integer) value; - case "goodUrls" -> goodUrls = (Integer) value; - case "state" -> state = (String) value; - case "redirectDomain" -> redirectDomain = (String) value; - case "ip" -> ip = (String) value; - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } -} diff --git a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReaderTest.java b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java similarity index 72% rename from code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReaderTest.java rename to code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java index 5090a65c..a358325a 100644 --- a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDocumentParquetFileReaderTest.java +++ b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java @@ -1,6 +1,7 @@ package nu.marginalia.io.processed; -import nu.marginalia.model.processed.ProcessedDocumentData; +import gnu.trove.list.array.TLongArrayList; +import 
nu.marginalia.model.processed.DocumentRecord; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -14,7 +15,7 @@ import java.util.stream.LongStream; import static org.junit.jupiter.api.Assertions.*; -class ProcessedDocumentParquetFileReaderTest { +class DocumentRecordParquetFileReaderTest { Path parquetFile; @BeforeEach @@ -29,7 +30,7 @@ class ProcessedDocumentParquetFileReaderTest { @Test public void test() throws IOException { - var doc = new ProcessedDocumentData( + var doc = new DocumentRecord( "www.marginalia.nu", "https://www.marginalia.nu/", 0, @@ -42,25 +43,26 @@ class ProcessedDocumentParquetFileReaderTest { 123, 0xF00BA3L, 0.25f, + 4L, null, List.of("Hello", "world"), - List.of(2L, 3L) + new TLongArrayList(new long[] { 2, 3}) ); - try (var writer = new ProcessedDocumentParquetFileWriter(parquetFile)) { + try (var writer = new DocumentRecordParquetFileWriter(parquetFile)) { writer.write(doc); } - var read = ProcessedDocumentParquetFileReader.stream(parquetFile).toList(); + var read = DocumentRecordParquetFileReader.stream(parquetFile).toList(); assertEquals(List.of(doc), read); } @Test public void testHugePayload() throws IOException { List words = IntStream.range(0, 100000).mapToObj(Integer::toString).toList(); - List metas = LongStream.range(0, 100000).boxed().toList(); + TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray()); - var doc = new ProcessedDocumentData( + var doc = new DocumentRecord( "www.marginalia.nu", "https://www.marginalia.nu/", 0, @@ -73,16 +75,17 @@ class ProcessedDocumentParquetFileReaderTest { 123, 0xF00BA3L, 0.25f, + 5L, null, words, metas ); - try (var writer = new ProcessedDocumentParquetFileWriter(parquetFile)) { + try (var writer = new DocumentRecordParquetFileWriter(parquetFile)) { writer.write(doc); } - var read = ProcessedDocumentParquetFileReader.stream(parquetFile).toList(); + var read = DocumentRecordParquetFileReader.stream(parquetFile).toList(); assertEquals(List.of(doc), read); } diff --git a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java new file mode 100644 index 00000000..274e80d0 --- /dev/null +++ b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java @@ -0,0 +1,49 @@ +package nu.marginalia.io.processed; + +import nu.marginalia.model.processed.DomainLinkRecord; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class DomainLinkRecordParquetFileReaderTest { + Path parquetFile; + + @BeforeEach + public void setUp() throws IOException { + parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(parquetFile); + } + + @Test + public void testReadFull() throws IOException { + var first = new DomainLinkRecord( + "www.marginalia.nu", + "memex.marginalia.nu"); + var second = new DomainLinkRecord( + "memex.marginalia.nu", + "search.marginalia.nu" + ); + + try (var writer = new DomainLinkRecordParquetFileWriter(parquetFile)) { + writer.write(first); + 
writer.write(second); + } + + var items = DomainLinkRecordParquetFileReader + .stream(parquetFile) + .toList(); + assertEquals(List.of(first, second), items); + } + +} \ No newline at end of file diff --git a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReaderTest.java b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java similarity index 72% rename from code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReaderTest.java rename to code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java index 5264624d..7c73f13e 100644 --- a/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/ProcessedDomainParquetFileReaderTest.java +++ b/code/process-models/processed-data/src/test/java/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java @@ -1,6 +1,6 @@ package nu.marginalia.io.processed; -import nu.marginalia.model.processed.ProcessedDomainData; +import nu.marginalia.model.processed.DomainRecord; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -12,7 +12,7 @@ import java.util.List; import static org.junit.jupiter.api.Assertions.*; -class ProcessedDomainParquetFileReaderTest { +class DomainRecordParquetFileReaderTest { Path parquetFile; @BeforeEach @@ -27,34 +27,36 @@ class ProcessedDomainParquetFileReaderTest { @Test public void testReadFull() throws IOException { - var first = new ProcessedDomainData( + var first = new DomainRecord( "www.marginalia.nu", 10, 3, 5, "'sall good man", null, - "127.0.0.1" + "127.0.0.1", + List.of("a", "b") ); - var second = new ProcessedDomainData( + var second = new DomainRecord( "memex.marginalia.nu", 0, 0, 0, "REDIRECT", "www.marginalia.nu", - "127.0.0.1" + "127.0.0.1", + null ); - try (var writer = new ProcessedDomainParquetFileWriter(parquetFile)) { + try (var writer = new DomainRecordParquetFileWriter(parquetFile)) { writer.write(first); writer.write(second); } - var domainNames = ProcessedDomainParquetFileReader.getDomainNames(parquetFile); + var domainNames = DomainRecordParquetFileReader.getDomainNames(parquetFile); assertEquals(List.of("www.marginalia.nu", "memex.marginalia.nu"), domainNames); - var items = ProcessedDomainParquetFileReader + var items = DomainRecordParquetFileReader .stream(parquetFile) .toList(); assertEquals(List.of(first, second), items); diff --git a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java index 9c6ba9bb..4919b33d 100644 --- a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java +++ b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLog.java @@ -23,6 +23,10 @@ public interface BatchingWorkLog extends AutoCloseable { * */ boolean isItemInCurrentBatch(String id); + default boolean isItemProcessed(String id) { + return isItemCommitted(id) || isItemInCurrentBatch(id); + } + /** Log additional item to the current batch */ void logItem(String id) throws IOException; @@ -32,4 +36,9 @@ public interface BatchingWorkLog extends AutoCloseable { void logFinishedBatch() throws IOException; int getBatchNumber(); + + /** Returns false if logItem has been invoked since last logFinishedBatch */ + boolean isCurrentBatchEmpty(); + + int 
size();
 }
diff --git a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
index b1538b10..2b8d8689 100644
--- a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
+++ b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
@@ -6,15 +6,24 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.time.LocalDateTime;
-import java.util.HashSet;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;

 public class BatchingWorkLogImpl implements BatchingWorkLog {
     private int batchNumber = 0;
-    private final Set<String> currentBatchItems = new HashSet<>(1000);
-    private final Set<String> commitedItems = new HashSet<>(10_000);
+
+    private final Set<String> currentBatchItems = ConcurrentHashMap.newKeySet(10_000);
+    private final Set<String> committedItems = ConcurrentHashMap.newKeySet(10_000);
     private final OutputStream writer;

+    /** Create or open a work log for appending new entries.
+     *
+     * Opening a work log this way will cause it to be modified
+     * with a comment annotating when it was opened, and possibly
+     * a crash marker to indicate that data is to be discarded.
+     *
+ * Use BatchingWorkLogInspector for read-only access! + */ public BatchingWorkLogImpl(Path file) throws IOException { if (Files.exists(file)) { try (var linesStream = Files.lines(file)) { @@ -24,7 +33,11 @@ public class BatchingWorkLogImpl implements BatchingWorkLog { } writer = Files.newOutputStream(file, StandardOpenOption.APPEND); + + // This is helpful for debugging, and will also ensure any partially written line + // gets a newline at the end writeLogEntry(new CommentLine("Log resumed on " + LocalDateTime.now())); + if (getCurrentBatchSize() > 0) { writeLogEntry(new CrashMarker()); } @@ -38,8 +51,6 @@ public class BatchingWorkLogImpl implements BatchingWorkLog { writeLogEntry(new CommentLine(" " + CrashMarker.MARKER + "\tdiscard contents from the current batch and start over, written after a crash")); writeLogEntry(new CommentLine("Upon a crash, items that have re-process until their batch is finalized")); } - - } void writeLogEntry(WorkLogItem item) throws IOException { @@ -54,7 +65,7 @@ public class BatchingWorkLogImpl implements BatchingWorkLog { @Override public boolean isItemCommitted(String id) { - return commitedItems.contains(id); + return committedItems.contains(id); } @Override @@ -76,7 +87,7 @@ public class BatchingWorkLogImpl implements BatchingWorkLog { batchNumber++; // Transfer all items from the current batch to the committed items' batch - commitedItems.addAll(currentBatchItems); + committedItems.addAll(currentBatchItems); currentBatchItems.clear(); } @@ -99,9 +110,18 @@ public class BatchingWorkLogImpl implements BatchingWorkLog { return batchNumber; } + @Override + public boolean isCurrentBatchEmpty() { + return currentBatchItems.isEmpty(); + } + public int getCurrentBatchSize() { return currentBatchItems.size(); } + + public int size() { + return currentBatchItems.size() + committedItems.size(); + } } interface WorkLogItem { diff --git a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogInspector.java b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogInspector.java new file mode 100644 index 00000000..c1498b38 --- /dev/null +++ b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogInspector.java @@ -0,0 +1,22 @@ +package nu.marginalia.worklog; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class BatchingWorkLogInspector { + /** Batches up until the return value of this method + * are considered valid. If the method returns 2, then batches + * 0 and 1 are good, etc. + *
+ * Invariant: BatchingWorkLogInspector.getValidBatches() always + * returns the same value as BatchingWorkLog.getBatchNumber() + */ + public static int getValidBatches(Path file) throws IOException { + try (var linesStream = Files.lines(file)) { + return (int) linesStream.map(WorkLogItem::parse) + .filter(FinishBatch.class::isInstance) + .count(); + } + } +} diff --git a/code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java b/code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java index fa765fd2..79e36b6d 100644 --- a/code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java +++ b/code/process-models/work-log/src/test/java/nu/marginalia/worklog/BatchingWorkLogImplTest.java @@ -31,11 +31,19 @@ class BatchingWorkLogImplTest { wl.logItem("1"); wl.logItem("2"); wl.logItem("3"); + assertEquals(wl.getBatchNumber(), BatchingWorkLogInspector.getValidBatches(fileName)); wl.logFinishedBatch(); + + assertEquals(wl.getBatchNumber(), BatchingWorkLogInspector.getValidBatches(fileName)); wl.logItem("4"); wl.logItem("5"); + + assertEquals(wl.getBatchNumber(), BatchingWorkLogInspector.getValidBatches(fileName)); + wl.logFinishedBatch(); wl.logItem("6"); + + assertEquals(wl.getBatchNumber(), BatchingWorkLogInspector.getValidBatches(fileName)); } try (var wl = new BatchingWorkLogImpl(fileName)) { @@ -56,8 +64,11 @@ class BatchingWorkLogImplTest { assertTrue(wl.isItemCommitted("5")); assertFalse(wl.isItemCommitted("6")); assertTrue(wl.isItemCommitted("7")); + + assertEquals(wl.getBatchNumber(), BatchingWorkLogInspector.getValidBatches(fileName)); } + Files.readAllLines(fileName).forEach(System.out::println); } } \ No newline at end of file diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 36a3ebdb..cb8e80e1 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -44,6 +44,8 @@ dependencies { implementation project(':code:libraries:language-processing') implementation project(':code:process-models:converting-model') + implementation project(':code:process-models:processed-data') + implementation project(':code:process-models:work-log') implementation project(':code:process-models:crawling-model') implementation project(':code:features-convert:adblock') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 9a5a78af..a982dcfa 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -7,6 +7,7 @@ import com.google.inject.Injector; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSourceFactory; +import nu.marginalia.converting.writer.ConverterWriter; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; @@ -17,6 +18,9 @@ import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.worklog.BatchingWorkLog; +import 
nu.marginalia.worklog.BatchingWorkLogImpl; +import org.checkerframework.checker.units.qual.C; import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; import nu.marginalia.converting.processor.DomainProcessor; @@ -108,7 +112,10 @@ public class ConverterMain { final int maxPoolSize = Runtime.getRuntime().availableProcessors(); - try (WorkLog processLog = plan.createProcessWorkLog(); + + + try (BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(plan.process.getLogFile()); + ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, plan.process.getDir()); ConversionLog log = new ConversionLog(plan.process.getDir())) { var instructionWriter = new InstructionWriterFactory(log, plan.process.getDir(), gson); @@ -118,30 +125,17 @@ public class ConverterMain { AtomicInteger processedDomains = new AtomicInteger(0); // Advance the progress bar to the current position if this is a resumption - processedDomains.set(processLog.countFinishedJobs()); + processedDomains.set(batchingWorkLog.size()); heartbeat.setProgress(processedDomains.get() / (double) totalDomains); - for (var domain : plan.crawlDataIterable(id -> !processLog.isJobFinished(id))) + for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id))) { pool.submit(() -> { - try { - ProcessedDomain processed = processor.process(domain); + ProcessedDomain processed = processor.process(domain); - final String where; - final int size; + converterWriter.accept(processed); - try (var writer = instructionWriter.createInstructionsForDomainWriter(processed.id)) { - compiler.compile(processed, writer::accept); - where = writer.getFileName(); - size = writer.getSize(); - } - - processLog.setJobToFinished(processed.id, where, size); - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); - } - catch (IOException ex) { - logger.warn("IO exception in converter", ex); - } + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); }); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java new file mode 100644 index 00000000..cea46f20 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -0,0 +1,232 @@ +package nu.marginalia.converting.writer; + +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.io.processed.DocumentRecordParquetFileWriter; +import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; +import nu.marginalia.io.processed.DomainRecordParquetFileWriter; +import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.processed.DocumentRecord; +import nu.marginalia.model.processed.DomainLinkRecord; +import nu.marginalia.model.processed.DomainRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.Callable; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; + +public class ConverterBatchWriter { + private final 
DomainRecordParquetFileWriter domainWriter; + private final DomainLinkRecordParquetFileWriter domainLinkWriter; + private final DocumentRecordParquetFileWriter documentWriter; + + private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class); + + ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { + domainWriter = new DomainRecordParquetFileWriter( + ProcessedDataFileNames.domainFileName(basePath, batchNumber) + ); + domainLinkWriter = new DomainLinkRecordParquetFileWriter( + ProcessedDataFileNames.domainLinkFileName(basePath, batchNumber) + ); + documentWriter = new DocumentRecordParquetFileWriter( + ProcessedDataFileNames.documentFileName(basePath, batchNumber) + ); + } + + public void write(ProcessedDomain domain) { + var results = ForkJoinPool.commonPool().invokeAll( + writeTasks(domain) + ); + + for (var result : results) { + if (result.state() == Future.State.FAILED) { + logger.warn("Parquet writing job failed", result.exceptionNow()); + } + } + } + + private List> writeTasks(ProcessedDomain domain) { + return List.of( + () -> writeDocumentData(domain), + () -> writeLinkData(domain), + () -> writeDomainData(domain) + ); + } + + private Object writeDocumentData(ProcessedDomain domain) throws IOException { + if (domain.documents == null) + return this; + + String domainName = domain.domain.toString(); + int ordinal = 0; + + for (var document : domain.documents) { + if (document.details == null) { + new DocumentRecord( + domainName, + document.url.toString(), + ordinal, + document.state.toString(), + document.stateReason, + null, + null, + 0, + null, + 0, + 0L, + -15, + 0L, + null, + null, + null); + } + else { + var wb = document.words.build(); + List words = Arrays.asList(wb.keywords); + TLongList metas = new TLongArrayList(wb.metadata); + + documentWriter.write(new DocumentRecord( + domainName, + document.url.toString(), + ordinal, + document.state.toString(), + document.stateReason, + document.details.title, + document.details.description, + HtmlFeature.encode(document.details.features), + document.details.standard.name(), + document.details.length, + document.details.hashCode, + (float) document.details.quality, + document.details.metadata.encode(), + document.details.pubYear, + words, + metas + )); + + } + + ordinal++; + } + + return this; + } + + private Object writeLinkData(ProcessedDomain domain) throws IOException { + String from = domain.domain.toString(); + + if (domain.documents == null) + return this; + + Set seen = new HashSet<>(); + + for (var doc : domain.documents) { + if (doc.details == null) + continue; + + for (var link : doc.details.linksExternal) { + var dest = link.domain; + + if (!seen.add(dest)) { + continue; + } + + domainLinkWriter.write(new DomainLinkRecord( + from, + dest.toString() + )); + } + } + + if (domain.redirect != null) { + domainLinkWriter.write(new DomainLinkRecord( + from, + domain.redirect.toString() + )); + } + + return this; + } + + private Object writeDomainData(ProcessedDomain domain) throws IOException { + DomainMetadata metadata = DomainMetadata.from(domain); + + List feeds = getFeedUrls(domain); + + domainWriter.write( + new DomainRecord( + domain.domain.toString(), + metadata.known(), + metadata.good(), + metadata.visited(), + Optional.ofNullable(domain.state).map(DomainIndexingState::toString).orElse(null), + Optional.ofNullable(domain.redirect).map(EdgeDomain::toString).orElse(null), + domain.ip, + feeds + ) + ); + + return this; + } + + private List getFeedUrls(ProcessedDomain domain) 
{ + var documents = domain.documents; + if (documents == null) + return List.of(); + + return documents.stream().map(doc -> doc.details) + .filter(Objects::nonNull) + .flatMap(dets -> dets.feedLinks.stream()) + .distinct() + .map(EdgeUrl::toString) + .toList(); + } + + public void close() throws IOException { + domainWriter.close(); + documentWriter.close(); + domainLinkWriter.close(); + } +} + +record DomainMetadata(int known, int good, int visited) { + + public static DomainMetadata from(ProcessedDomain domain) { + + var documents = domain.documents; + if (documents == null) { + return new DomainMetadata(0, 0, 0); + } + + int visitedUrls = 0; + int goodUrls = 0; + Set knownUrls = new HashSet<>(); + + for (var doc : documents) { + visitedUrls++; + + if (doc.isOk()) { + goodUrls++; + } + + knownUrls.add(doc.url); + + Optional.ofNullable(doc.details) + .map(details -> details.linksInternal) + .ifPresent(knownUrls::addAll); + } + + return new DomainMetadata(knownUrls.size(), goodUrls, visitedUrls); + } + +} \ No newline at end of file diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java new file mode 100644 index 00000000..595601a5 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java @@ -0,0 +1,125 @@ +package nu.marginalia.converting.writer; + +import lombok.SneakyThrows; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.worklog.BatchingWorkLog; + +import java.nio.file.Path; +import java.time.Duration; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; + +public class ConverterWriter implements AutoCloseable { + + private final BatchingWorkLog workLog; + private final Path basePath; + + private final Duration switchInterval = + Duration.of(10, ChronoUnit.MINUTES); + private final ArrayBlockingQueue domainData = + new ArrayBlockingQueue<>(4); + + private final Thread workerThread; + + ConverterBatchWriter writer; + + volatile boolean running = true; + public ConverterWriter(BatchingWorkLog workLog, Path basePath) { + this.workLog = workLog; + this.basePath = basePath; + + workerThread = new Thread(this::writerThread, getClass().getSimpleName()); + workerThread.start(); + } + + @SneakyThrows + public void accept(ProcessedDomain domain) { + domainData.put(domain); + } + + @SneakyThrows + private void writerThread() { + IntervalAction switcher = new IntervalAction(this::switchBatch, switchInterval); + + writer = new ConverterBatchWriter(basePath, workLog.getBatchNumber()); + + while (running || !domainData.isEmpty()) { + var data = domainData.poll(10, TimeUnit.SECONDS); + + if (data == null) + continue; + + String id = data.id; + + if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) + continue; + + writer.write(data); + + workLog.logItem(id); + + switcher.tick(); + } + } + + @SneakyThrows + public boolean switchBatch() { + if (workLog.isCurrentBatchEmpty()) { + // Nothing to commit + return false; + } + + // order matters here + writer.close(); + workLog.logFinishedBatch(); + writer = new ConverterBatchWriter(basePath, workLog.getBatchNumber()); + + return true; + } + + @Override + public void close() throws Exception { + running = false; + workerThread.join(); + + // order matters 
here + writer.close(); + workLog.logFinishedBatch(); + } +} + +class IntervalAction { + private final Callable action; + private final Duration interval; + + private Instant nextActionInstant; + + IntervalAction(Callable action, Duration interval) { + this.action = action; + this.interval = interval; + } + + /** Execute the provided action if enough time has passed + * since the last successful invocation */ + public void tick() { + if (nextActionInstant == null) { + nextActionInstant = Instant.now().plus(interval); + return; + } + + if (Instant.now().isBefore(nextActionInstant)) + return; + + try { + if (action.call()) { + nextActionInstant = Instant.now().plus(interval); + } + } + catch (Exception ex) { + throw new RuntimeException(ex); + } + } +} \ No newline at end of file diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index fd23a7d0..736fec8d 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -36,6 +36,8 @@ dependencies { implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:converting-model') + implementation project(':code:process-models:processed-data') + implementation project(':code:process-models:work-log') implementation project(':code:features-convert:keyword-extraction') diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java deleted file mode 100644 index 97b26fb4..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java +++ /dev/null @@ -1,109 +0,0 @@ -package nu.marginalia.loading; - -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; -import lombok.SneakyThrows; -import nu.marginalia.converting.instruction.Instruction; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; -import java.lang.ref.Cleaner; -import java.nio.file.Path; -import java.util.Iterator; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicBoolean; - -public class ConvertedDomainReader { - private final ExecutorService executorService = Executors.newSingleThreadExecutor(); - private static final Logger logger = LoggerFactory.getLogger(ConvertedDomainReader.class); - - /** Creates a new iterator over Path. The implementation will try to read the file in a separate thread, and - * will block until the first instruction is available. Iterator$hasNext may block. 
- */ - public Iterator createIterator(Path path) { - return new PrefetchingInstructionIterator(path); - } - - class PrefetchingInstructionIterator implements Iterator { - - private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(16); - private final AtomicBoolean finished = new AtomicBoolean(false); - - private Instruction next = null; - - private final static Cleaner cleaner = Cleaner.create(); - static class CancelAction implements Runnable { - private final Future future; - - public CancelAction(Future taskFuture) { - this.future = taskFuture; - } - - public void run() { - future.cancel(true); - } - - } - - public PrefetchingInstructionIterator(Path path) { - var taskFuture = executorService.submit(() -> readerThread(path)); - - // Cancel the future if the iterator is garbage collected - // to reduce the risk of leaking resources; as the worker thread - // will spin forever on put if the queue is full. - - cleaner.register(this, new CancelAction(taskFuture)); - } - - private Object readerThread(Path path) { - try (var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())), RecyclingBufferPool.INSTANCE))) { - for (;;) { - var nextObject = or.readObject(); - if (nextObject instanceof Instruction is) { - queue.put(is); - } else { - logger.warn("Spurious object in file: {}", nextObject.getClass().getSimpleName()); - } - } - } catch (EOFException ex) { - // Expected - return null; - } catch (ClassNotFoundException | IOException | InterruptedException e) { - logger.warn("Error reading file " + path, e); - throw new RuntimeException(e); - } finally { - finished.set(true); - } - } - - @SneakyThrows - @Override - public boolean hasNext() { - if (next != null) - return true; - - // As long as the worker is still running, we'll do a blocking poll to wait for the next instruction - // (but we wake up every second to check if the worker is still running) - while (!finished.get()) { - if (null != (next = queue.poll(1, TimeUnit.SECONDS))) { - return true; - } - } - - // If the worker is not running, we just drain the queue without waiting - return null != (next = queue.poll()); - } - - @Override - public Instruction next() { - if (next != null || hasNext()) { - try { return next; } - finally { next = null; } - } - throw new IllegalStateException(); - } - - } - -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 97b2bf0d..675feb3d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -5,21 +5,21 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import lombok.SneakyThrows; -import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.db.storage.FileStorageService; -import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.linkdb.LinkdbWriter; -import nu.marginalia.loading.loader.IndexLoadKeywords; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.loading.documents.DocumentLoaderService; +import nu.marginalia.loading.documents.KeywordLoaderService; +import nu.marginalia.loading.documents.LoaderIndexJournalWriter; +import nu.marginalia.loading.domains.DomainIdRegistry; +import nu.marginalia.loading.domains.DomainLoaderService; 
+import nu.marginalia.loading.links.DomainLinksLoaderService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeatImpl; -import nu.marginalia.process.log.WorkLog; +import nu.marginalia.worklog.BatchingWorkLogInspector; import plan.CrawlPlan; -import nu.marginalia.loading.loader.LoaderFactory; import nu.marginalia.service.module.DatabaseModule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,14 +35,15 @@ import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX; public class LoaderMain { private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); - private final ConvertedDomainReader instructionsReader; - private final LoaderFactory loaderFactory; - private final ProcessHeartbeatImpl heartbeat; private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; - private final IndexLoadKeywords indexLoadKeywords; - private final LinkdbWriter writer; + private final LinkdbWriter linkdbWriter; + private final LoaderIndexJournalWriter journalWriter; + private final DomainLoaderService domainService; + private final DomainLinksLoaderService linksService; + private final KeywordLoaderService keywordLoaderService; + private final DocumentLoaderService documentLoaderService; private final Gson gson; public static void main(String... args) throws Exception { @@ -65,90 +66,66 @@ public class LoaderMain { } @Inject - public LoaderMain(ConvertedDomainReader instructionsReader, - LoaderFactory loaderFactory, - ProcessHeartbeatImpl heartbeat, + public LoaderMain(ProcessHeartbeatImpl heartbeat, MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, - IndexLoadKeywords indexLoadKeywords, - LinkdbWriter writer, + LinkdbWriter linkdbWriter, + LoaderIndexJournalWriter journalWriter, + DomainLoaderService domainService, + DomainLinksLoaderService linksService, + KeywordLoaderService keywordLoaderService, + DocumentLoaderService documentLoaderService, Gson gson ) { - this.instructionsReader = instructionsReader; - this.loaderFactory = loaderFactory; this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; - this.indexLoadKeywords = indexLoadKeywords; - this.writer = writer; + this.linkdbWriter = linkdbWriter; + this.journalWriter = journalWriter; + this.domainService = domainService; + this.linksService = linksService; + this.keywordLoaderService = keywordLoaderService; + this.documentLoaderService = documentLoaderService; this.gson = gson; heartbeat.start(); } @SneakyThrows - public void run(LoadRequest instructions) { + void run(LoadRequest instructions) { var plan = instructions.getPlan(); - var logFile = plan.process.getLogFile(); + var processLogFile = plan.process.getLogFile(); + + Path inputDataDir = plan.process.getDir(); + int validBatchCount = BatchingWorkLogInspector.getValidBatches(processLogFile); + + DomainIdRegistry domainIdRegistry = + domainService.getOrCreateDomainIds( + inputDataDir, + validBatchCount); - TaskStats taskStats = new TaskStats(100); try { - int loadTotal = 0; - int loaded = 0; - - for (var unused : WorkLog.iterable(logFile)) { - loadTotal++; - } - - logger.info("Loading {} files", loadTotal); - for (var entry : WorkLog.iterable(logFile)) { - InstructionCounter instructionCounter = new InstructionCounter(); - - 
heartbeat.setProgress(loaded++ / (double) loadTotal); - long startTime = System.currentTimeMillis(); - - Path destDir = plan.getProcessedFilePath(entry.path()); - - try (var loader = loaderFactory.create(entry.cnt())) { - var instructionsIter = instructionsReader.createIterator(destDir); - - while (instructionsIter.hasNext()) { - var next = instructionsIter.next(); - try { - next.apply(instructionCounter); - next.apply(loader); - } catch (Exception ex) { - logger.error("Failed to load instruction " + next.getClass().getSimpleName(), ex); - } - } - } - - long endTime = System.currentTimeMillis(); - long loadTime = endTime - startTime; - taskStats.observe(endTime - startTime); - - logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), - loadTotal, destDir, instructionCounter.getCount(), loadTime, taskStats.avgTime()); - } + linksService + .loadLinks(domainIdRegistry, inputDataDir, validBatchCount); + keywordLoaderService + .loadKeywords(domainIdRegistry, inputDataDir, validBatchCount); + documentLoaderService + .loadDocuments(domainIdRegistry, inputDataDir, validBatchCount); instructions.ok(); - - // This needs to be done in order to have a readable index journal - indexLoadKeywords.close(); - writer.close(); - logger.info("Loading finished"); } catch (Exception ex) { - ex.printStackTrace(); - logger.error("Failed to load", ex); instructions.err(); - throw ex; + logger.error("Error", ex); } finally { + journalWriter.close(); + linkdbWriter.close(); heartbeat.shutDown(); } + System.exit(0); } @@ -213,15 +190,4 @@ public class LoaderMain { } } - public class InstructionCounter implements Interpreter { - private int count = 0; - - public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { - count++; - } - - public int getCount() { - return count; - } - } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/TaskStats.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/TaskStats.java deleted file mode 100644 index 15ff182b..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/TaskStats.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.loading; - -public class TaskStats { - private final long[] taskTimes; - private int count = 0; - private long total = 0; - - public TaskStats(int windowSize) { - taskTimes = new long[windowSize]; - } - - public synchronized void observe(long time) { - taskTimes[count++%taskTimes.length] = time; - total += time; - } - - public double avgTime() { - long tts = 0; - long tot; - - if (count < taskTimes.length) tot = count; - else tot = taskTimes.length; - - for (int i = 0; i < tot; i++) tts += taskTimes[i]; - - return (tot * 10_000L / tts)/10.; - } - - public double totalTime() { - return total; - } - - public int getCount() { - return count; - } - -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java new file mode 100644 index 00000000..c6ea5a5e --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java @@ -0,0 +1,97 @@ +package nu.marginalia.loading.documents; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.SneakyThrows; +import nu.marginalia.io.processed.DocumentRecordParquetFileReader; +import 
nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.linkdb.LinkdbWriter; +import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.loading.domains.DomainIdRegistry; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.processed.DocumentRecordMetadataProjection; + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +@Singleton +public class DocumentLoaderService { + private final LinkdbWriter linkdbWriter; + + @Inject + public DocumentLoaderService(LinkdbWriter linkdbWriter) { + this.linkdbWriter = linkdbWriter; + } + + public void loadDocuments(DomainIdRegistry domainIdRegistry, + Path processedDataPathBase, + int untilBatch) + throws IOException, SQLException + { + var documentFiles = ProcessedDataFileNames.listDocumentFiles(processedDataPathBase, untilBatch); + for (var file : documentFiles) { + loadDocumentsFromFile(domainIdRegistry, file); + } + } + + private void loadDocumentsFromFile(DomainIdRegistry domainIdRegistry, Path file) + throws SQLException, IOException + { + try (var stream = DocumentRecordParquetFileReader.streamMetadataProjection(file); + LinkdbLoader loader = new LinkdbLoader(domainIdRegistry) + ) + { + stream.forEach(loader::accept); + } + } + + class LinkdbLoader implements AutoCloseable { + private final DomainIdRegistry domainIdRegistry; + private final List details = new ArrayList<>(1000); + + LinkdbLoader(DomainIdRegistry domainIdRegistry) { + this.domainIdRegistry = domainIdRegistry; + } + + @SneakyThrows + public void accept(DocumentRecordMetadataProjection projection) + { + + long urlId = UrlIdCodec.encodeId( + domainIdRegistry.getDomainId(projection.domain), + projection.ordinal + ); + + details.add(new LdbUrlDetail( + urlId, + new EdgeUrl(projection.url), + projection.title, + projection.description, + projection.quality, + projection.htmlStandard, + projection.htmlFeatures, + projection.pubYear, + projection.hash, + projection.getLength() + )); + + if (details.size() > 100) { + linkdbWriter.add(details); + details.clear(); + } + + } + + @Override + public void close() throws SQLException { + if (!details.isEmpty()) { + linkdbWriter.add(details); + } + } + } + +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java new file mode 100644 index 00000000..ef9b619e --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java @@ -0,0 +1,57 @@ +package nu.marginalia.loading.documents; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.io.processed.DocumentRecordParquetFileReader; +import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.keyword.model.DocumentKeywords; +import nu.marginalia.loading.domains.DomainIdRegistry; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.processed.DocumentRecordKeywordsProjection; + +import java.io.IOException; +import java.nio.file.Path; + +@Singleton +public class KeywordLoaderService { + private final LoaderIndexJournalWriter writer; + + @Inject + public KeywordLoaderService(LoaderIndexJournalWriter writer) { + this.writer = writer; + } + + public void loadKeywords(DomainIdRegistry domainIdRegistry, + Path processedDataPathBase, + int 
untilBatch) throws IOException { + var documentFiles = ProcessedDataFileNames.listDocumentFiles(processedDataPathBase, untilBatch); + for (var file : documentFiles) { + loadKeywordsFromFile(domainIdRegistry, file); + } + } + + private void loadKeywordsFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException { + try (var stream = DocumentRecordParquetFileReader.streamKeywordsProjection(file)) { + stream.filter(DocumentRecordKeywordsProjection::hasKeywords) + .forEach(proj -> insertKeywords(domainIdRegistry, proj)); + } + } + + private void insertKeywords(DomainIdRegistry domainIdRegistry, + DocumentRecordKeywordsProjection projection) + { + long combinedId = UrlIdCodec.encodeId( + domainIdRegistry.getDomainId(projection.domain), + projection.ordinal); + + var words = new DocumentKeywords( + projection.words.toArray(String[]::new), + projection.metas.toArray() + ); + + writer.putWords(combinedId, + projection.htmlFeatures, + projection.documentMetadata, + words); + } +} \ No newline at end of file diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/LoaderIndexJournalWriter.java similarity index 90% rename from code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java rename to code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/LoaderIndexJournalWriter.java index be88b091..0bfd6193 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/LoaderIndexJournalWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.loading.loader; +package nu.marginalia.loading.documents; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -44,12 +44,20 @@ public class LoaderIndexJournalWriter { indexWriter = new IndexJournalWriterPagingImpl(indexArea.asPath()); } - @SneakyThrows public void putWords(long combinedId, int features, DocumentMetadata metadata, DocumentKeywords wordSet) { + putWords(combinedId, features, metadata.encode(), wordSet); + } + + @SneakyThrows + public void putWords(long combinedId, + int features, + long metadata, + DocumentKeywords wordSet) { + if (wordSet.isEmpty()) { logger.info("Skipping zero-length word set for {}", combinedId); return; @@ -75,7 +83,7 @@ public class LoaderIndexJournalWriter { } var entry = new IndexJournalEntryData(i, buffer); - var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode()); + var header = new IndexJournalEntryHeader(combinedId, features, metadata); indexWriter.put(header, entry); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java new file mode 100644 index 00000000..cb825641 --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java @@ -0,0 +1,36 @@ +package nu.marginalia.loading.domains; + +import nu.marginalia.model.EdgeDomain; + +import java.util.HashMap; +import java.util.Map; + +/** Maps domain names to domain ids */ +public class DomainIdRegistry { + private final Map domainIds = new HashMap<>(); + + public int getDomainId(String domainName) { + Integer id = domainIds.get(domainName.toLowerCase()); + + if (id == null) { + // 
This is a very severe problem + throw new IllegalStateException("Unknown domain id for domain " + domainName); + } + + return id; + } + + public int getDomainId(EdgeDomain domainName) { + return getDomainId(domainName.toString()); + } + + + public boolean isKnown(String domainName) { + return domainIds.containsKey(domainName); + } + + void add(String domainName, int id) { + domainIds.put(domainName, id); + } + +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java new file mode 100644 index 00000000..860780d3 --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -0,0 +1,97 @@ +package nu.marginalia.loading.domains; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; +import nu.marginalia.io.processed.DomainRecordParquetFileReader; +import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.model.EdgeDomain; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.*; + +@Singleton +public class DomainLoaderService { + + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(DomainLoaderService.class); + + @Inject + public DomainLoaderService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + /** Read the domain names from each parquet file + * compare with SQL domain database, fetch those + * that exist, insert those that don't. + */ + public DomainIdRegistry getOrCreateDomainIds(Path processedDataPathBase, int untilBatch) + throws IOException, SQLException + { + Collection domainNamesAll = readDomainNames(processedDataPathBase, untilBatch); + return getDatabaseIds(domainNamesAll); + } + + Collection readDomainNames(Path processedDataPathBase, int untilBatch) throws IOException { + final Set domainNamesAll = new HashSet<>(100_000); + + var domainFiles = ProcessedDataFileNames.listDomainFiles(processedDataPathBase, untilBatch); + for (var file : domainFiles) { + domainNamesAll.addAll(DomainRecordParquetFileReader.getDomainNames(file)); + } + + var linkFiles = ProcessedDataFileNames.listDomainLinkFiles(processedDataPathBase, untilBatch); + for (var file : linkFiles) { + domainNamesAll.addAll(DomainLinkRecordParquetFileReader.getDestDomainNames(file)); + } + + return domainNamesAll; + } + + DomainIdRegistry getDatabaseIds(Collection domainNamesAll) throws SQLException { + DomainIdRegistry ret = new DomainIdRegistry(); + + try (var conn = dataSource.getConnection(); + var insertStmt = conn.prepareStatement(""" + INSERT IGNORE INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP) VALUES (?, ?) + """); + var selectStmt = conn.prepareStatement(""" + SELECT ID, DOMAIN_NAME FROM EC_DOMAIN WHERE DOMAIN_NAME=? 
+ """) + ) { + + int i = 0; + for (var domain : domainNamesAll) { + var parsed = new EdgeDomain(domain); + insertStmt.setString(1, domain); + insertStmt.setString(2, parsed.domain); + insertStmt.addBatch(); + if (++i > 1000) { + i = 0; + insertStmt.executeBatch(); + } + } + if (i > 0) { + insertStmt.executeBatch(); + } + + for (var domain : domainNamesAll) { + selectStmt.setString(1, domain); + var rs = selectStmt.executeQuery(); + if (rs.next()) { + ret.add(domain, rs.getInt(1)); + } + else { + logger.error("Unknown domain {}", domain); + } + } + } + + return ret; + } +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java new file mode 100644 index 00000000..c70647a6 --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java @@ -0,0 +1,104 @@ +package nu.marginalia.loading.links; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; +import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.loading.domains.DomainIdRegistry; +import nu.marginalia.model.processed.DomainLinkRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; + +@Singleton +public class DomainLinksLoaderService { + + private final HikariDataSource dataSource; + private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class); + + @Inject + public DomainLinksLoaderService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public void loadLinks(DomainIdRegistry domainIdRegistry, Path processedDataPathBase, int untilBatch) throws IOException, SQLException { + + dropLinkData(); + + var linkFiles = ProcessedDataFileNames.listDomainLinkFiles(processedDataPathBase, untilBatch); + for (var file : linkFiles) { + loadLinksFromFile(domainIdRegistry, file); + } + } + + private void dropLinkData() throws SQLException { + logger.info("Truncating EC_DOMAIN_LINK"); + + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement()) { + stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK"); + } + } + + private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException, SQLException { + try (var domainStream = DomainLinkRecordParquetFileReader.stream(file); + var linkLoader = new LinkLoader(domainIdRegistry)) + { + logger.info("Loading links from {}", file); + domainStream.forEach(linkLoader::accept); + } + } + + class LinkLoader implements AutoCloseable { + private final Connection connection; + private final PreparedStatement insertStatement; + private final DomainIdRegistry domainIdRegistry; + + private int batchSize = 0; + private int total = 0; + + public LinkLoader(DomainIdRegistry domainIdRegistry) throws SQLException { + this.domainIdRegistry = domainIdRegistry; + + connection = dataSource.getConnection(); + insertStatement = connection.prepareStatement(""" + INSERT INTO EC_DOMAIN_LINK(SOURCE_DOMAIN_ID, DEST_DOMAIN_ID) + VALUES (?, ?) 
+ """); + } + + void accept(DomainLinkRecord record) { + try { + insertStatement.setInt(1, domainIdRegistry.getDomainId(record.source)); + insertStatement.setInt(2, domainIdRegistry.getDomainId(record.dest)); + insertStatement.addBatch(); + if (++batchSize > 1000) { + batchSize = 0; + insertStatement.executeBatch(); + } + total++; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + @Override + public void close() throws SQLException { + if (batchSize > 0) { + insertStatement.executeBatch(); + } + + logger.info("Inserted {} links", total); + + insertStatement.close(); + connection.close(); + } + } +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java deleted file mode 100644 index 2e24b843..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java +++ /dev/null @@ -1,47 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.DocumentMetadata; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class IndexLoadKeywords { - private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class); - private final LoaderIndexJournalWriter journalWriter; - - private volatile boolean canceled = false; - - @Inject - public IndexLoadKeywords(LoaderIndexJournalWriter journalWriter) { - this.journalWriter = journalWriter; - } - - - public void close() throws Exception { - if (!canceled) { - journalWriter.close(); - } - } - - public void load(LoaderData loaderData, - int ordinal, - EdgeUrl url, - int features, - DocumentMetadata metadata, - DocumentKeywords words) { - long combinedId = UrlIdCodec.encodeId(loaderData.getTargetDomainId(), ordinal); - - if (combinedId <= 0) { - logger.warn("Failed to get IDs for {} -- c={}", url, combinedId); - return; - } - - journalWriter.putWords(combinedId, - features, - metadata, - words); - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java deleted file mode 100644 index acb3b26b..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java +++ /dev/null @@ -1,83 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; -import nu.marginalia.linkdb.LinkdbStatusWriter; -import nu.marginalia.linkdb.LinkdbWriter; -import nu.marginalia.linkdb.model.LdbUrlDetail; -import nu.marginalia.linkdb.model.UrlStatus; -import nu.marginalia.model.id.UrlIdCodec; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.List; - -public class LdbLoadProcessedDocument { - private static final Logger logger = LoggerFactory.getLogger(LdbLoadProcessedDocument.class); - private final LinkdbWriter linkdbWriter; - private final LinkdbStatusWriter linkdbStatusWriter; - - @Inject - public LdbLoadProcessedDocument(LinkdbWriter linkdbWriter, 
- LinkdbStatusWriter linkdbStatusWriter - ) { - this.linkdbWriter = linkdbWriter; - this.linkdbStatusWriter = linkdbStatusWriter; - } - - public void load(LoaderData data, List documents) { - var details = new ArrayList(); - - int domainId = data.getTargetDomainId(); - var statusList = new ArrayList(); - - for (var document : documents) { - long id = UrlIdCodec.encodeId(domainId, document.ordinal()); - details.add(new LdbUrlDetail( - id, - document.url(), - document.title(), - document.description(), - document.quality(), - document.standard(), - document.htmlFeatures(), - document.pubYear(), - document.hash(), - document.length() - )); - statusList.add(new UrlStatus(id, document.url(), document.state().toString(), null)); - } - - try { - linkdbWriter.add(details); - } - catch (SQLException ex) { - logger.warn("Failed to add processed documents to linkdb", ex); - } - } - - public void loadWithError(LoaderData data, List documents) { - var statusList = new ArrayList(); - int domainId = data.getTargetDomainId(); - - for (var document : documents) { - statusList.add(new UrlStatus( - UrlIdCodec.encodeId(domainId, document.ordinal()), - document.url(), - document.state().toString(), - document.reason() - )); - } - - try { - linkdbStatusWriter.add(statusList); - } - catch (SQLException ex) { - logger.warn("Failed to add processed documents to linkdb", ex); - } - } - -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java deleted file mode 100644 index 8af672c5..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ /dev/null @@ -1,118 +0,0 @@ -package nu.marginalia.loading.loader; - -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.List; - -public class Loader implements Interpreter, AutoCloseable { - private final SqlLoadDomains sqlLoadDomains; - private final SqlLoadDomainLinks sqlLoadDomainLinks; - private final SqlLoadProcessedDomain sqlLoadProcessedDomain; - private final LdbLoadProcessedDocument loadProcessedDocument; - private final SqlLoadDomainMetadata sqlLoadDomainMetadata; - - private final IndexLoadKeywords indexLoadKeywords; - - private static final Logger logger = LoggerFactory.getLogger(Loader.class); - - private final List processedDocumentList; - private final List processedDocumentWithErrorList; - - - public final LoaderData data; - - public Loader(int sizeHint, - SqlLoadDomains sqlLoadDomains, - SqlLoadDomainLinks sqlLoadDomainLinks, - SqlLoadProcessedDomain sqlLoadProcessedDomain, - LdbLoadProcessedDocument loadProcessedDocument, - SqlLoadDomainMetadata sqlLoadDomainMetadata, - IndexLoadKeywords indexLoadKeywords) { - data = new LoaderData(sizeHint); - - this.sqlLoadDomains = sqlLoadDomains; - this.sqlLoadDomainLinks = sqlLoadDomainLinks; - this.sqlLoadProcessedDomain = 
sqlLoadProcessedDomain; - this.loadProcessedDocument = loadProcessedDocument; - this.sqlLoadDomainMetadata = sqlLoadDomainMetadata; - this.indexLoadKeywords = indexLoadKeywords; - - processedDocumentList = new ArrayList<>(sizeHint); - processedDocumentWithErrorList = new ArrayList<>(sizeHint); - } - - @Override - public void loadDomain(EdgeDomain[] domains) { - sqlLoadDomains.load(data, domains); - } - - @Override - public void loadRssFeed(EdgeUrl[] rssFeed) { - logger.debug("loadRssFeed({})", rssFeed, null); - } - - @Override - public void loadDomainLink(DomainLink[] links) { - sqlLoadDomainLinks.load(data, links); - } - - @Override - public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) { - sqlLoadProcessedDomain.load(data, domain, state, ip); - } - - @Override - public void loadProcessedDocument(LoadProcessedDocument document) { - processedDocumentList.add(document); - - if (processedDocumentList.size() > 1000) { - loadProcessedDocument.load(data, processedDocumentList); - processedDocumentList.clear(); - } - } - @Override - public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) { - processedDocumentWithErrorList.add(document); - - if (processedDocumentWithErrorList.size() > 1000) { - loadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); - processedDocumentWithErrorList.clear(); - } - } - @Override - public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { - indexLoadKeywords.load(data, ordinal, url, features, metadata, words); - } - - @Override - public void loadDomainRedirect(DomainLink link) { - sqlLoadProcessedDomain.loadAlias(data, link); - } - - @Override - public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) { - sqlLoadDomainMetadata.load(data, domain, knownUrls, goodUrls, visitedUrls); - } - - public void close() { - if (processedDocumentList.size() > 0) { - loadProcessedDocument.load(data, processedDocumentList); - } - if (processedDocumentWithErrorList.size() > 0) { - loadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); - } - } - -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java deleted file mode 100644 index 613b880d..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java +++ /dev/null @@ -1,38 +0,0 @@ -package nu.marginalia.loading.loader; - -import gnu.trove.map.hash.TObjectIntHashMap; -import nu.marginalia.model.EdgeDomain; - -public class LoaderData { - - private final TObjectIntHashMap domainIds; - private EdgeDomain targetDomain; - public final int sizeHint; - private int targetDomainId = -1; - - public LoaderData(int sizeHint) { - domainIds = new TObjectIntHashMap<>(10); - this.sizeHint = sizeHint; - } - - public void setTargetDomain(EdgeDomain domain) { - this.targetDomain = domain; - } - public EdgeDomain getTargetDomain() { - return targetDomain; - } - - public int getTargetDomainId() { - if (targetDomainId < 0) - targetDomainId = domainIds.get(targetDomain); - return targetDomainId; - } - - public void addDomain(EdgeDomain domain, int id) { - domainIds.put(domain, id); - } - - public int getDomainId(EdgeDomain domain) { - return domainIds.get(domain); - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java deleted file mode 100644 index f5984b51..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; - -public class LoaderFactory { - private final SqlLoadDomains sqlLoadDomains; - private final SqlLoadDomainLinks sqlLoadDomainLinks; - private final SqlLoadProcessedDomain sqlLoadProcessedDomain; - private final LdbLoadProcessedDocument sqlLoadProcessedDocument; - private final SqlLoadDomainMetadata sqlLoadDomainMetadata; - private final IndexLoadKeywords indexLoadKeywords; - - @Inject - public LoaderFactory(SqlLoadDomains sqlLoadDomains, - SqlLoadDomainLinks sqlLoadDomainLinks, - SqlLoadProcessedDomain sqlLoadProcessedDomain, - LdbLoadProcessedDocument sqlLoadProcessedDocument, - SqlLoadDomainMetadata sqlLoadDomainMetadata, - IndexLoadKeywords indexLoadKeywords) { - this.sqlLoadDomains = sqlLoadDomains; - this.sqlLoadDomainLinks = sqlLoadDomainLinks; - this.sqlLoadProcessedDomain = sqlLoadProcessedDomain; - this.sqlLoadProcessedDocument = sqlLoadProcessedDocument; - this.sqlLoadDomainMetadata = sqlLoadDomainMetadata; - this.indexLoadKeywords = indexLoadKeywords; - } - - public Loader create(int sizeHint) { - return new Loader(sizeHint, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords); - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainLinks.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainLinks.java deleted file mode 100644 index 79028d4c..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainLinks.java +++ /dev/null @@ -1,84 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; - -import static java.sql.Statement.SUCCESS_NO_INFO; - -public class SqlLoadDomainLinks { - - private final HikariDataSource dataSource; - private static final Logger logger = LoggerFactory.getLogger(SqlLoadDomainLinks.class); - - @Inject - public SqlLoadDomainLinks(HikariDataSource dataSource) { - this.dataSource = dataSource; - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.createStatement()) { - stmt.execute("DROP PROCEDURE IF EXISTS INSERT_LINK"); - stmt.execute(""" - CREATE PROCEDURE INSERT_LINK ( - IN FROM_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, - IN TO_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci - ) - BEGIN - INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID) - SELECT SOURCE.ID,DEST.ID - FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST - ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN; - END - """); - } - } - catch (SQLException ex) { - throw new RuntimeException("Failed to set up loader", ex); - } - } - - public void load(LoaderData data, DomainLink[] links) { - - try (var connection = dataSource.getConnection(); - var nukeExistingLinksForDomain = - connection.prepareStatement(""" - DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=? 
- """); - var stmt = - connection.prepareCall("CALL INSERT_LINK(?,?)")) - { - - connection.setAutoCommit(false); - nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from())); - nukeExistingLinksForDomain.executeUpdate(); - - for (DomainLink link : links) { - stmt.setString(1, link.from().toString()); - stmt.setString(2, link.to().toString()); - - stmt.addBatch(); - } - - var ret = stmt.executeBatch(); - for (int rv = 0; rv < links.length; rv++) { - if (ret[rv] != 1 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", links[rv], ret[rv]); - } - } - - connection.commit(); - connection.setAutoCommit(true); - - } - catch (SQLException ex) { - logger.warn("SQL error inserting domain links", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java deleted file mode 100644 index bf1dbcdc..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java +++ /dev/null @@ -1,41 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.model.EdgeDomain; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.inject.Inject; -import java.sql.SQLException; - -public class SqlLoadDomainMetadata { - private final HikariDataSource dataSource; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public SqlLoadDomainMetadata(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - - public void load(LoaderData data, EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) { - int domainId = data.getDomainId(domain); - - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement(""" - INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,VISITED_URLS,GOOD_URLS) VALUES (?, ?, ?, ?) 
- """ - )) - { - stmt.setInt(1, domainId); - stmt.setInt(2, knownUrls); - stmt.setInt(3, visitedUrls); - stmt.setInt(4, goodUrls); - stmt.executeUpdate(); - } catch (SQLException ex) { - logger.warn("SQL error inserting domains", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java deleted file mode 100644 index 3ecd2411..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java +++ /dev/null @@ -1,163 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.model.EdgeDomain; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.sql.Connection; -import java.sql.SQLException; - -import static java.sql.Statement.SUCCESS_NO_INFO; - -public class SqlLoadDomains { - - private final HikariDataSource dataSource; - private static final Logger logger = LoggerFactory.getLogger(SqlLoadDomains.class); - - @Inject - public SqlLoadDomains(HikariDataSource dataSource) { - this.dataSource = dataSource; - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.createStatement()) { - stmt.execute("DROP PROCEDURE IF EXISTS INSERT_DOMAIN"); - stmt.execute(""" - CREATE PROCEDURE INSERT_DOMAIN ( - IN DOMAIN_NAME VARCHAR(255), - IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci) - BEGIN - INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN); - END - """); - } - } - catch (SQLException ex) { - throw new RuntimeException("Failed to set up loader", ex); - } - } - - public void load(LoaderData data, EdgeDomain domain) { - - try (var connection = dataSource.getConnection()) { - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { - connection.setAutoCommit(false); - insertCall.setString(1, domain.toString()); - insertCall.setString(2, domain.domain); - - var ret = insertCall.executeUpdate(); - connection.commit(); - if (ret < 0) { - logger.warn("load({}) -- bad return status {}", domain, ret); - } - - findIdForDomain(connection, data, domain); - connection.setAutoCommit(true); - } - } - catch (SQLException ex) { - logger.warn("SQL error inserting domain", ex); - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - - - } - - public void load(LoaderData data, EdgeDomain[] domains) { - - try (var connection = dataSource.getConnection()) { - connection.setAutoCommit(false); - - try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { - - - int cnt = 0; int batchOffset = 0; - for (var domain : domains) { - insertCall.setString(1, domain.toString()); - insertCall.setString(2, domain.domain); - insertCall.addBatch(); - - if (++cnt == 1000) { - var ret = insertCall.executeBatch(); - connection.commit(); - - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]); - } - } - - cnt = 0; - batchOffset += 1000; - } - } - if (cnt > 0) { - var ret = insertCall.executeBatch(); - connection.commit(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", domains[batchOffset + 
rv], ret[rv]); - } - } - } - - } - connection.commit(); - connection.setAutoCommit(true); - findIdForDomain(connection, data, domains); - } - catch (SQLException ex) { - logger.warn("SQL error inserting domains", ex); - } - } - - void findIdForDomain(Connection connection, LoaderData data, EdgeDomain... domains) { - if (data.getTargetDomain() == null || data.getDomainId(data.getTargetDomain()) > 0) { - return; - } - - try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) - { - - for (var domain : domains) { - if (data.getDomainId(domain) > 0) - continue; - - query.setString(1, domain.toString()); - var rsp = query.executeQuery(); - if (rsp.next()) { - data.addDomain(domain, rsp.getInt(1)); - } else { - logger.warn("load() -- could not find ID for target domain {}", domain); - } - } - } - catch (SQLException ex) { - logger.warn("SQL error finding id for domain", ex); - } - } - - void loadAdditionalDomains(Connection connection, LoaderData data, EdgeDomain[] domains) { - - try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) - { - for (var domain : domains) { - - if (data.getDomainId(domain) == 0) continue; - - query.setString(1, domain.toString()); - var rsp = query.executeQuery(); - if (rsp.next()) { - data.addDomain(domain, rsp.getInt(1)); - } else { - logger.warn("load() -- could not find ID for target domain {}", domain); - } - } - } - catch (SQLException ex) { - logger.warn("SQL error finding id for domain", ex); - } - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java deleted file mode 100644 index 9bf94816..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ /dev/null @@ -1,104 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.model.EdgeDomain; -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; - -public class SqlLoadProcessedDomain { - private final HikariDataSource dataSource; - private final SqlLoadDomains loadDomains; - private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class); - - @Inject - public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) { - this.dataSource = dataSource; - this.loadDomains = loadDomains; - - - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.createStatement()) { - stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); - - stmt.execute(""" - CREATE PROCEDURE INITIALIZE_DOMAIN ( - IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), - IN IDX INT, - IN DID INT, - IN IP VARCHAR(48)) - BEGIN - DELETE FROM DOMAIN_METADATA WHERE ID=DID; - DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; - UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; - DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; - END - """); - } - } - catch (SQLException ex) { - throw new RuntimeException("Failed to set up loader", ex); - } - } - - public void 
load(LoaderData data, EdgeDomain domain, DomainIndexingState state, String ip) { - - data.setTargetDomain(domain); - - loadDomains.load(data, domain); - - try (var conn = dataSource.getConnection()) { - try (var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) { - initCall.setString(1, state.name()); - initCall.setInt(2, 1 + data.sizeHint / 100); - initCall.setInt(3, data.getDomainId(domain)); - initCall.setString(4, StringUtils.truncate(ip, 48)); - int rc = initCall.executeUpdate(); - conn.commit(); - if (rc < 1) { - logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); - } - } - catch (SQLException ex) { - conn.rollback(); - throw ex; - } - } - catch (SQLException ex) { - logger.warn("SQL error initializing domain", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - - } - - public void loadAlias(LoaderData data, DomainLink link) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement(""" - UPDATE EC_DOMAIN TARGET - INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=? - SET TARGET.DOMAIN_ALIAS=ALIAS.ID - WHERE TARGET.DOMAIN_NAME=? - """)) { - stmt.setString(1, link.to().toString()); - stmt.setString(2, link.from().toString()); - int rc = stmt.executeUpdate(); - conn.commit(); - if (rc != 1) { - logger.warn("loadAlias({}) - unexpected row count {}", link, rc); - } - } - catch (SQLException ex) { - logger.warn("SQL error inserting domain alias", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - } -} diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java deleted file mode 100644 index a8d85699..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java +++ /dev/null @@ -1,52 +0,0 @@ -package nu.marginalia.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.loading.loader.LoaderData; -import nu.marginalia.loading.loader.SqlLoadDomainLinks; -import nu.marginalia.loading.loader.SqlLoadDomains; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.model.EdgeDomain; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -@Tag("slow") -@Testcontainers -class SqlLoadDomainLinksTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("db/migration/V23_06_0_000__base.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - @BeforeEach - public void setUp() { - dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - - var loadDomains = new SqlLoadDomains(dataSource); - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadDomainLinks() { - var loader = new SqlLoadDomainLinks(dataSource); - 
loader.load(loaderData, new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) }); - } - -} \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java deleted file mode 100644 index 16d52d33..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java +++ /dev/null @@ -1,55 +0,0 @@ -package nu.marginalia.loader; - -import nu.marginalia.loading.loader.LoaderData; -import nu.marginalia.loading.loader.SqlLoadDomains; -import nu.marginalia.model.EdgeDomain; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -@Tag("slow") -@Testcontainers -class SqlLoadDomainsTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("db/migration/V23_06_0_000__base.sql") - .withNetworkAliases("mariadb"); - - @Test - public void loadDomain() { - - try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl())) { - var loadDomains = new SqlLoadDomains(dataSource); - var loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); - - assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); - } - - } - - @Test - public void loadDomains() { - - try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { - var loadDomains = new SqlLoadDomains(dataSource); - var loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); - - assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0); - assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0); - } - - } -} \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java deleted file mode 100644 index e1fa8223..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ /dev/null @@ -1,76 +0,0 @@ -package nu.marginalia.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.loading.loader.LoaderData; -import nu.marginalia.loading.loader.SqlLoadDomains; -import nu.marginalia.loading.loader.SqlLoadProcessedDomain; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.DomainIndexingState; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import 
java.util.stream.Collectors; -import java.util.stream.Stream; - -@Tag("slow") -@Testcontainers -class SqlLoadProcessedDomainTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("db/migration/V23_06_0_000__base.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - @BeforeEach - public void setUp() { - - dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - - var loadDomains = new SqlLoadDomains(dataSource); - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") }); - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadProcessedDomain() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); - loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); - } - @Test - public void loadProcessedDomainTwice() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); - loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); - } - - @Test - public void loadProcessedDomaiWithExtremelyLongIP() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); - - String ip = Stream.generate(() -> "127.").limit(1024).collect(Collectors.joining()); - - loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, ip); - } - - @Test - public void loadDomainAlias() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); - loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); - } -} \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java new file mode 100644 index 00000000..9340fe14 --- /dev/null +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java @@ -0,0 +1,130 @@ +package nu.marginalia.loading.domains; + +import com.google.common.collect.Lists; +import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; +import nu.marginalia.io.processed.DomainRecordParquetFileWriter; +import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.loader.DbTestUtil; +import nu.marginalia.model.processed.DomainLinkRecord; +import nu.marginalia.model.processed.DomainRecord; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.*; + +@Tag("slow") +@Testcontainers +class DomainLoaderServiceTest { + List toDelete = 
new ArrayList<>(); + + @Container + static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_06_0_000__base.sql") + .withNetworkAliases("mariadb"); + + @AfterEach + public void tearDown() throws IOException { + for (var path : Lists.reverse(toDelete)) { + Files.deleteIfExists(path); + } + + toDelete.clear(); + } + @Test + void readDomainNames() throws IOException { + Path workDir = Files.createTempDirectory(getClass().getSimpleName()); + Path parquetFile1 = ProcessedDataFileNames.domainFileName(workDir, 0); + Path parquetFile2 = ProcessedDataFileNames.domainFileName(workDir, 1); + Path parquetFile3 = ProcessedDataFileNames.domainLinkFileName(workDir, 0); + + toDelete.add(workDir); + toDelete.add(parquetFile1); + toDelete.add(parquetFile2); + toDelete.add(parquetFile3); + + // Prep by creating two parquet files with domains + // and one with domain links + + List<String> domains1 = List.of("www.marginalia.nu", "memex.marginalia.nu", "search.marginalia.nu"); + List<String> domains2 = List.of("wiby.me", "www.mojeek.com", "www.altavista.com"); + List<String> linkDomains = List.of("maya.land", "xkcd.com", "aaronsw.com"); + + try (var pw = new DomainRecordParquetFileWriter(parquetFile1)) { + for (var domain : domains1) { + pw.write(dr(domain)); + } + } + try (var pw = new DomainRecordParquetFileWriter(parquetFile2)) { + for (var domain : domains2) { + pw.write(dr(domain)); + } + } + try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile3)) { + for (var domain : linkDomains) { + pw.write(dl(domain)); + } + } + // Read them + var domainService = new DomainLoaderService(null); + var domainNames = domainService.readDomainNames(workDir, 2); + + // Verify + Set<String> expectedDomains = Stream.of(domains1, domains2, linkDomains) + .flatMap(List::stream) + .collect(Collectors.toSet()); + assertEquals(expectedDomains, domainNames); + } + + @Test + void getDatabaseIds() { + try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl())) { + var domainService = new DomainLoaderService(dataSource); + + for (int i = 0; i < 2; i++) { + // run the test case twice to cover both the insert and query cases + System.out.println("Case " + i); + + var domains = List.of("memex.marginalia.nu", "www.marginalia.nu", "search.marginalia.nu", "wiby.me"); + var data = domainService.getDatabaseIds(domains); + + Map<String, Integer> ids = new HashMap<>(); + + for (String domain : domains) { + ids.put(domain, data.getDomainId(domain)); + } + + // Verify we got 4 domain IDs for the provided inputs + var entries = new HashSet<>(ids.values()); + assertEquals(4, entries.size()); + assertEquals(Set.of(1,2,3,4), entries); // this may be fragile?
+ } + + } catch (SQLException e) { + Assertions.fail(e); + } + } + + private DomainRecord dr(String domainName) { + return new DomainRecord(domainName, 0, 0, 0, null, null, null, null); + } + + private DomainLinkRecord dl(String destDomainName) { + return new DomainLinkRecord("www.marginalia.nu", destDomainName); + } +} \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java new file mode 100644 index 00000000..2f1f9b00 --- /dev/null +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java @@ -0,0 +1,124 @@ +package nu.marginalia.loading.links; + +import com.google.common.collect.Lists; +import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; +import nu.marginalia.io.processed.DomainRecordParquetFileWriter; +import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.loader.DbTestUtil; +import nu.marginalia.loading.domains.DomainLoaderService; +import nu.marginalia.model.processed.DomainLinkRecord; +import nu.marginalia.model.processed.DomainRecord; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("slow") +@Testcontainers +class DomainLinksLoaderServiceTest { + List<Path> toDelete = new ArrayList<>(); + + @Container + static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_06_0_000__base.sql") + .withNetworkAliases("mariadb"); + + @AfterEach + public void tearDown() throws IOException { + for (var path : Lists.reverse(toDelete)) { + Files.deleteIfExists(path); + } + + toDelete.clear(); + } + + @Test + public void test() throws IOException, SQLException { + Path workDir = Files.createTempDirectory(getClass().getSimpleName()); + Path parquetFile1 = ProcessedDataFileNames.domainFileName(workDir, 0); + Path parquetFile2 = ProcessedDataFileNames.domainLinkFileName(workDir, 0); + Path parquetFile3 = ProcessedDataFileNames.domainLinkFileName(workDir, 1); + + toDelete.add(workDir); + toDelete.add(parquetFile1); + toDelete.add(parquetFile2); + toDelete.add(parquetFile3); + + List<String> domains1 = List.of("www.marginalia.nu", "search.marginalia.nu"); + List<String> linkDomains1 = List.of("wiby.me", "www.mojeek.com", "www.altavista.com"); + List<String> linkDomains2 = List.of("maya.land", "xkcd.com", "aaronsw.com"); + + try (var pw = new DomainRecordParquetFileWriter(parquetFile1)) { + for (var domain : domains1) { + pw.write(dr(domain)); + } + } + try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile2)) { + for (var domain : linkDomains1) { + pw.write(dl("www.marginalia.nu", domain)); + } + } + try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile3)) { + for (var domain : linkDomains2) { + pw.write(dl("search.marginalia.nu", domain)); + } + } + + try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + var conn =
dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK + """) + ) { + var domainService = new DomainLoaderService(dataSource); + var domainRegistry = domainService.getOrCreateDomainIds(workDir, 2); + + var dls = new DomainLinksLoaderService(dataSource); + dls.loadLinks(domainRegistry, workDir, 2); + + Map> expected = new HashMap<>(); + Map> actual = new HashMap<>(); + expected.put(domainRegistry.getDomainId("www.marginalia.nu"), new HashSet<>()); + expected.put(domainRegistry.getDomainId("search.marginalia.nu"), new HashSet<>()); + + for (var domain : linkDomains1) { + expected.get(domainRegistry.getDomainId("www.marginalia.nu")).add(domainRegistry.getDomainId(domain)); + } + for (var domain : linkDomains2) { + expected.get(domainRegistry.getDomainId("search.marginalia.nu")).add(domainRegistry.getDomainId(domain)); + } + + var rs = query.executeQuery(); + while (rs.next()) { + actual.computeIfAbsent(rs.getInt(1), k -> new HashSet<>()) + .add(rs.getInt(2)); + } + + assertEquals(expected, actual); + + } + + + } + + private DomainRecord dr(String domainName) { + return new DomainRecord(domainName, 0, 0, 0, null, null, null, null); + } + + private DomainLinkRecord dl(String sourceDomainName, String destDomainName) { + return new DomainLinkRecord(sourceDomainName, destDomainName); + } +} \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java index 44b55476..709b3110 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java @@ -5,6 +5,7 @@ import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; import nu.marginalia.keyword.model.DocumentKeywords; +import nu.marginalia.loading.documents.LoaderIndexJournalWriter; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginallia.index.journal.IndexJournalFileNames; import org.junit.jupiter.api.AfterEach; From 4799dd769e139eb18cce76db91d8d731b4119372 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 13 Sep 2023 19:18:58 +0200 Subject: [PATCH 08/14] (converting) WIP begin to remove converting-model and the old InstructionsCompiler --- .../process-models/converting-model/readme.md | 48 +----- .../converting/instruction/Instruction.java | 10 -- .../instruction/InstructionTag.java | 25 ---- .../converting/instruction/Interpreter.java | 26 ---- .../instruction/instructions/DomainLink.java | 8 - .../instruction/instructions/LoadDomain.java | 31 ---- .../instructions/LoadDomainLink.java | 31 ---- .../instructions/LoadDomainMetadata.java | 28 ---- .../instructions/LoadDomainRedirect.java | 29 ---- .../instructions/LoadKeywords.java | 32 ---- .../instructions/LoadProcessedDocument.java | 37 ----- .../LoadProcessedDocumentWithError.java | 29 ---- .../instructions/LoadProcessedDomain.java | 26 ---- .../instruction/instructions/LoadRssFeed.java | 32 ---- .../marginalia/converting/ConversionLog.java | 37 ----- .../marginalia/converting/ConverterMain.java | 30 +--- .../converting/InstructionWriterFactory.java | 141 ------------------ .../compiler/DocumentsCompiler.java | 59 -------- 
.../compiler/DomainMetadataCompiler.java | 47 ------ .../converting/compiler/FeedsCompiler.java | 24 --- .../compiler/InstructionsCompiler.java | 88 ----------- .../converting/compiler/LinksCompiler.java | 35 ----- .../converting/compiler/RedirectCompiler.java | 20 --- .../converting/model/GeneratorType.java | 0 24 files changed, 4 insertions(+), 869 deletions(-) delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomain.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainLink.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainMetadata.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainRedirect.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java delete mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadRssFeed.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java rename code/{process-models/converting-model => processes/converting-process}/src/main/java/nu/marginalia/converting/model/GeneratorType.java (100%) diff --git a/code/process-models/converting-model/readme.md b/code/process-models/converting-model/readme.md index feaae4b3..52973e48 100644 --- a/code/process-models/converting-model/readme.md +++ b/code/process-models/converting-model/readme.md @@ -1,49 +1,3 @@ # 
Converting Models -Contains models shared by the [converting-process](../../processes/converting-process/) and -[loading-process](../../processes/loading-process/). - -## Design - -The two processes communicate through a file-based protocol. The converter serializes [instructions](src/main/java/nu/marginalia/converting/instruction/Instruction.java) -to file, which are deserialized by the loader and fed into an [instructions](src/main/java/nu/marginalia/converting/instruction/Interpreter.java). - -The instructions implement a visitor pattern. - -Conceptually the pattern can be thought of a bit like remote function calls over file, -or a crude instructions-based programming language. - -This - -```java -producer.foo("cat"); -producer.bar("milk", "eggs", "bread"); -``` - -translates through this paradigm, to this: - -``` -(producer) -writeInstruction(DoFoo("Cat")) -writeInstruction(DoBar("Milk", "Eggs", "Bread")) - -(consumer) -while read instruction: - interpreter.apply(instruction) - -(Interpreter) -doFoo(animal): - ... -doBar(ingredients): - ... - -(doFoo) -DoFoo(animal): - apply(interpreter): - interpreter.foo(animal) - -(doBar) -DoBar(ingredients): - apply(interpreter): - interpreter.bar(ingredients) -``` +!!To be deleted!! \ No newline at end of file diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java deleted file mode 100644 index b36ef217..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.converting.instruction; - -import java.io.Serializable; - -public interface Instruction extends Serializable { - void apply(Interpreter interpreter); - boolean isNoOp(); - - InstructionTag tag(); -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java deleted file mode 100644 index 23584925..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java +++ /dev/null @@ -1,25 +0,0 @@ -package nu.marginalia.converting.instruction; - -import nu.marginalia.converting.instruction.instructions.*; - -public enum InstructionTag { - - DOMAIN(LoadDomain.class), - LINK(LoadDomainLink.class), - REDIRECT(LoadDomainRedirect.class), - WORDS(LoadKeywords.class), - PROC_DOCUMENT(LoadProcessedDocument.class), - PROC_DOCUMENT_ERR(LoadProcessedDocumentWithError.class), - PROC_DOMAIN(LoadProcessedDomain.class), - - DOMAIN_METADATA(LoadDomainMetadata.class), - - RSS(LoadRssFeed.class); - - public final Class clazz; - - InstructionTag(Class clazz) { - this.clazz = clazz; - } - -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java deleted file mode 100644 index 624081c9..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.converting.instruction; - -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import 
nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; - -public interface Interpreter { - default void loadDomain(EdgeDomain[] domain) {} - default void loadRssFeed(EdgeUrl[] rssFeed) {} - default void loadDomainLink(DomainLink[] links) {} - - default void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {} - default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} - default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {} - - default void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) {} - - default void loadDomainRedirect(DomainLink link) {} - - default void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) {} -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java deleted file mode 100644 index 22230a37..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java +++ /dev/null @@ -1,8 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.model.EdgeDomain; - -import java.io.Serializable; - -public record DomainLink(EdgeDomain from, EdgeDomain to) implements Serializable { -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomain.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomain.java deleted file mode 100644 index f1f361a1..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomain.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeDomain; - -import java.util.Arrays; - -public record LoadDomain(EdgeDomain... 
domain) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadDomain(domain); - } - - @Override - public boolean isNoOp() { - return domain.length == 0; - } - - @Override - public InstructionTag tag() { - return InstructionTag.DOMAIN; - } - - @Override - public String toString() { - return getClass().getSimpleName()+"["+Arrays.toString(domain)+"]"; - } -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainLink.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainLink.java deleted file mode 100644 index 9a5b85f8..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainLink.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; - -import java.util.Arrays; - -public record LoadDomainLink(DomainLink... links) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadDomainLink(links); - } - - @Override - public String toString() { - return getClass().getSimpleName()+"["+ Arrays.toString(links)+"]"; - } - - @Override - public InstructionTag tag() { - return InstructionTag.LINK; - } - - @Override - public boolean isNoOp() { - return links.length == 0; - } - -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainMetadata.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainMetadata.java deleted file mode 100644 index 88da806c..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainMetadata.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; - -import java.util.Arrays; - -public record LoadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadDomainMetadata(domain, knownUrls, goodUrls, visitedUrls); - } - - @Override - public boolean isNoOp() { - return false; - } - - @Override - public InstructionTag tag() { - return InstructionTag.DOMAIN_METADATA; - } - -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainRedirect.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainRedirect.java deleted file mode 100644 index 5bd357ab..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainRedirect.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import 
nu.marginalia.converting.instruction.Interpreter; - -public record LoadDomainRedirect(DomainLink links) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadDomainRedirect(links); - } - - @Override - public String toString() { - return getClass().getSimpleName()+"["+ links+"]"; - } - - @Override - public InstructionTag tag() { - return InstructionTag.REDIRECT; - } - - @Override - public boolean isNoOp() { - return false; - } - -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java deleted file mode 100644 index 96c78611..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java +++ /dev/null @@ -1,32 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeUrl; - -public record LoadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadKeywords(url, ordinal, features, metadata, words); - } - - @Override - public boolean isNoOp() { - return false; - } - - @Override - public InstructionTag tag() { - return InstructionTag.WORDS; - } - - @Override - public String toString() { - return getClass().getSimpleName()+"["+ words+"]"; - } - -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java deleted file mode 100644 index 2a43494c..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.model.crawl.UrlIndexingState; -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeUrl; -import org.jetbrains.annotations.Nullable; - - -public record LoadProcessedDocument(EdgeUrl url, - int ordinal, UrlIndexingState state, - String title, - String description, - int htmlFeatures, - String standard, - int length, - long hash, - double quality, - @Nullable Integer pubYear -) implements Instruction -{ - @Override - public void apply(Interpreter interpreter) { - interpreter.loadProcessedDocument(this); - } - - @Override - public InstructionTag tag() { - return InstructionTag.PROC_DOCUMENT; - } - - @Override - public boolean isNoOp() { - return false; - } -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java deleted file mode 100644 
index a1a42a90..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.model.crawl.UrlIndexingState; -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeUrl; - - -public record LoadProcessedDocumentWithError(EdgeUrl url, - UrlIndexingState state, - String reason, - int ordinal) implements Instruction -{ - @Override - public void apply(Interpreter interpreter) { - interpreter.loadProcessedDocumentWithError(this); - } - - @Override - public InstructionTag tag() { - return InstructionTag.PROC_DOCUMENT_ERR; - } - - @Override - public boolean isNoOp() { - return false; - } -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java deleted file mode 100644 index 1186c38d..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeDomain; - -public record LoadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadProcessedDomain(domain, state, ip); - } - - @Override - public InstructionTag tag() { - return InstructionTag.PROC_DOMAIN; - } - - @Override - public boolean isNoOp() { - return false; - } - -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadRssFeed.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadRssFeed.java deleted file mode 100644 index f6c8d7b5..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadRssFeed.java +++ /dev/null @@ -1,32 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeUrl; - -import java.util.Arrays; - -public record LoadRssFeed(EdgeUrl... 
feeds) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadRssFeed(feeds); - } - - @Override - public String toString() { - return getClass().getSimpleName()+"["+ Arrays.toString(feeds)+"]"; - } - - @Override - public InstructionTag tag() { - return InstructionTag.RSS; - } - - @Override - public boolean isNoOp() { - return feeds.length == 0; - } - -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java deleted file mode 100644 index 865e6d6b..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.converting; - -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdOutputStream; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.time.LocalDateTime; -import java.time.ZoneOffset; - -public class ConversionLog implements AutoCloseable, Interpreter { - private final PrintWriter writer; - - public ConversionLog(Path rootDir) throws IOException { - String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC)); - Path logFile = rootDir.resolve(fileName); - - writer = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE)), RecyclingBufferPool.INSTANCE)); - } - - @Override - public void close() throws IOException { - writer.close(); - } - - @Override - public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { - writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason()); - } - -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index a982dcfa..eefb2be2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -20,14 +20,11 @@ import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.worklog.BatchingWorkLog; import nu.marginalia.worklog.BatchingWorkLogImpl; -import org.checkerframework.checker.units.qual.C; import plan.CrawlPlan; -import nu.marginalia.converting.compiler.InstructionsCompiler; import nu.marginalia.converting.processor.DomainProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.nio.file.Path; import java.sql.SQLException; import java.util.Optional; @@ -40,7 +37,6 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; public class ConverterMain { private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class); private final DomainProcessor processor; - private final InstructionsCompiler compiler; private final Gson gson; private final ProcessHeartbeat heartbeat; private final 
MessageQueueFactory messageQueueFactory; @@ -69,7 +65,6 @@ public class ConverterMain { @Inject public ConverterMain( DomainProcessor processor, - InstructionsCompiler compiler, Gson gson, ProcessHeartbeatImpl heartbeat, MessageQueueFactory messageQueueFactory, @@ -78,7 +73,6 @@ public class ConverterMain { ) { this.processor = processor; - this.compiler = compiler; this.gson = gson; this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; @@ -91,21 +85,7 @@ public class ConverterMain { public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception { int maxPoolSize = 16; - try (WorkLog workLog = new WorkLog(writeDir.resolve("processor.log")); - ConversionLog conversionLog = new ConversionLog(writeDir)) { - var instructionWriter = new InstructionWriterFactory(conversionLog, writeDir, gson); - - final String where; - final int size; - - try (var writer = instructionWriter.createInstructionsForDomainWriter(sideloadSource.getId())) { - compiler.compileStreaming(sideloadSource, writer::accept); - where = writer.getFileName(); - size = writer.getSize(); - } - - workLog.setJobToFinished(sideloadSource.getId(), where, size); - } + // FIXME } public void convert(CrawlPlan plan) throws Exception { @@ -115,10 +95,8 @@ public class ConverterMain { try (BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(plan.process.getLogFile()); - ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, plan.process.getDir()); - ConversionLog log = new ConversionLog(plan.process.getDir())) { - var instructionWriter = new InstructionWriterFactory(log, plan.process.getDir(), gson); - + ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, plan.process.getDir())) + { var pool = new DumbThreadPool(maxPoolSize, 2); int totalDomains = plan.countCrawledDomains(); @@ -132,9 +110,7 @@ public class ConverterMain { { pool.submit(() -> { ProcessedDomain processed = processor.process(domain); - converterWriter.accept(processed); - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); }); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java deleted file mode 100644 index e3b68629..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java +++ /dev/null @@ -1,141 +0,0 @@ -package nu.marginalia.converting; - -import com.github.luben.zstd.ZstdOutputStream; -import com.google.gson.Gson; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; -import java.nio.file.Files; -import java.nio.file.Path; - -public class InstructionWriterFactory { - - private final ConversionLog log; - private final Path outputDir; - private final Gson gson; - private static final Logger logger = LoggerFactory.getLogger(InstructionWriterFactory.class); - - public InstructionWriterFactory(ConversionLog log, Path outputDir, Gson gson) { - this.log = log; - this.outputDir = outputDir; 
- this.gson = gson; - - if (!Files.isDirectory(outputDir)) { - throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); - } - } - - public InstructionWriter createInstructionsForDomainWriter(String id) throws IOException { - Path outputFile = getOutputFile(id); - return new InstructionWriter(outputFile); - } - - public class InstructionWriter implements AutoCloseable { - private final ObjectOutputStream outputStream; - private final String where; - private final SummarizingInterpreter summary = new SummarizingInterpreter(); - - private int size = 0; - - - InstructionWriter(Path filename) throws IOException { - where = filename.getFileName().toString(); - Files.deleteIfExists(filename); - outputStream = new ObjectOutputStream(new ZstdOutputStream(new FileOutputStream(filename.toFile()))); - } - - public void accept(Instruction instruction) { - if (instruction.isNoOp()) return; - - instruction.apply(summary); - instruction.apply(log); - - size++; - - try { - outputStream.writeObject(instruction); - - // Reset the stream to avoid keeping references to the objects - // (as this will cause the memory usage to grow indefinitely when - // writing huge amounts of data) - outputStream.reset(); - } - catch (IOException ex) { - logger.warn("IO exception writing instruction", ex); - } - } - - @Override - public void close() throws IOException { - logger.info("Wrote {} - {} - {}", where, size, summary); - outputStream.close(); - } - - public String getFileName() { - return where; - } - - public int getSize() { - return size; - } - } - - private Path getOutputFile(String id) throws IOException { - String first = id.substring(0, 2); - String second = id.substring(2, 4); - - Path destDir = outputDir.resolve(first).resolve(second); - if (!Files.exists(destDir)) { - Files.createDirectories(destDir); - } - - return destDir.resolve(id + ".pzstd"); - } - - private static class SummarizingInterpreter implements Interpreter { - - private String domainName; - private int ok = 0; - private int error = 0; - - int keywords = 0; - int documents = 0; - - public String toString() { - // This shouldn't happen (TM) - assert keywords == documents : "keywords != documents"; - - return String.format("%s - %d %d", domainName, ok, error); - } - - @Override - public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) { - this.domainName = domain.toString(); - } - - @Override - public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { - documents++; - } - - @Override - public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { - keywords++; - } - - @Override - public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) { - ok += goodUrls; - error += visitedUrls - goodUrls; - } - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java deleted file mode 100644 index b3cb2a9f..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ /dev/null @@ -1,59 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.LoadKeywords; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import 
nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.model.crawl.HtmlFeature; - -import java.util.List; -import java.util.function.Consumer; - -public class DocumentsCompiler { - - public void compileDocumentDetails(Consumer<Instruction> instructionConsumer, - ProcessedDocument doc, - int ordinal) { - var details = doc.details; - - if (details != null) { - instructionConsumer.accept(new LoadProcessedDocument(doc.url, - ordinal, - doc.state, - details.title, - details.description, - HtmlFeature.encode(details.features), - details.standard.name(), - details.length, - details.hashCode, - details.quality, - details.pubYear - )); - } - else { - instructionConsumer.accept(new LoadProcessedDocumentWithError( - doc.url, - doc.state, - doc.stateReason, - ordinal - )); - } - } - - public void compileWords(Consumer<Instruction> instructionConsumer, - ProcessedDocument doc, - int ordinal) { - var words = doc.words; - - if (words != null) { - instructionConsumer.accept(new LoadKeywords(doc.url, - ordinal, - HtmlFeature.encode(doc.details.features), - doc.details.metadata, - words.build()) - ); - } - } - -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java deleted file mode 100644 index 3909edb1..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java +++ /dev/null @@ -1,47 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.LoadDomainMetadata; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.jetbrains.annotations.NotNull; - -import java.util.HashSet; -import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.function.Consumer; - -public class DomainMetadataCompiler { - - - public void compile(Consumer<Instruction> instructionConsumer, EdgeDomain domain, @NotNull List<ProcessedDocument> documents) { - - int visitedUrls = 0; - int goodUrls = 0; - - Set<EdgeUrl> knownUrls = new HashSet<>(documents.size() * 2); - - for (var doc : documents) { - visitedUrls++; - - if (doc.isOk()) { - goodUrls++; - } - - knownUrls.add(doc.url); - - Optional.ofNullable(doc.details) - .map(details -> details.linksInternal) - .ifPresent(knownUrls::addAll); - } - - instructionConsumer.accept(new LoadDomainMetadata(domain, knownUrls.size(), goodUrls, visitedUrls)); - } - - public void compileFake(Consumer<Instruction> instructionConsumer, EdgeDomain domain, int countAll, int countGood) { - instructionConsumer.accept(new LoadDomainMetadata(domain, countAll, countGood, countAll)); - } - -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java deleted file mode 100644 index 2c111ea2..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.LoadRssFeed; -import 
nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.model.EdgeUrl; - -import java.util.List; -import java.util.Objects; -import java.util.function.Consumer; - -public class FeedsCompiler { - - public void compile(Consumer<Instruction> instructionConsumer, List<ProcessedDocument> documents) { - - EdgeUrl[] feeds = documents.stream().map(doc -> doc.details) - .filter(Objects::nonNull) - .flatMap(dets -> dets.feedLinks.stream()) - .distinct() - .toArray(EdgeUrl[]::new); - - instructionConsumer.accept(new LoadRssFeed(feeds)); - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java deleted file mode 100644 index 65d2e989..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ /dev/null @@ -1,88 +0,0 @@ -package nu.marginalia.converting.compiler; - -import com.google.inject.Inject; -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDomain; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.converting.model.ProcessedDomain; -import nu.marginalia.converting.sideload.SideloadSource; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Collections; -import java.util.Iterator; -import java.util.function.Consumer; - -import static java.util.Objects.requireNonNullElse; - -public class InstructionsCompiler { - private final DocumentsCompiler documentsCompiler; - private final DomainMetadataCompiler domainMetadataCompiler; - private final FeedsCompiler feedsCompiler; - private final LinksCompiler linksCompiler; - private final RedirectCompiler redirectCompiler; - - private final Logger logger = LoggerFactory.getLogger(InstructionsCompiler.class); - - @Inject - public InstructionsCompiler(DocumentsCompiler documentsCompiler, - DomainMetadataCompiler domainMetadataCompiler, - FeedsCompiler feedsCompiler, - LinksCompiler linksCompiler, - RedirectCompiler redirectCompiler) - { - this.documentsCompiler = documentsCompiler; - this.domainMetadataCompiler = domainMetadataCompiler; - this.feedsCompiler = feedsCompiler; - this.linksCompiler = linksCompiler; - this.redirectCompiler = redirectCompiler; - } - - public void compile(ProcessedDomain domain, Consumer<Instruction> instructionConsumer) { - // Guaranteed to always be first - instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); - - if (domain.documents != null) { - - int ordinal = 0; - for (var doc : domain.documents) { - documentsCompiler.compileDocumentDetails(instructionConsumer, doc, ordinal); - documentsCompiler.compileWords(instructionConsumer, doc, ordinal); - ordinal++; - } - - feedsCompiler.compile(instructionConsumer, domain.documents); - linksCompiler.compile(instructionConsumer, domain.domain, domain.documents); - } - if (domain.redirect != null) { - redirectCompiler.compile(instructionConsumer, domain.domain, domain.redirect); - } - - domainMetadataCompiler.compile(instructionConsumer, domain.domain, requireNonNullElse(domain.documents, Collections.emptyList())); - } - - public void compileStreaming(SideloadSource sideloadSource, - Consumer<Instruction> instructionConsumer) { - ProcessedDomain domain = sideloadSource.getDomain(); - Iterator<ProcessedDocument> documentsIterator = sideloadSource.getDocumentsStream(); - - // Guaranteed to always be first - 
instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); - - int countAll = 0; - int countGood = 0; - - logger.info("Writing docs"); - - while (documentsIterator.hasNext()) { - var doc = documentsIterator.next(); - countAll++; - if (doc.isOk()) countGood++; - - documentsCompiler.compileDocumentDetails(instructionConsumer, doc, countAll); - documentsCompiler.compileWords(instructionConsumer, doc, countAll); - } - - domainMetadataCompiler.compileFake(instructionConsumer, domain.domain, countAll, countGood); - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java deleted file mode 100644 index e84a7c54..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java +++ /dev/null @@ -1,35 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.converting.instruction.instructions.LoadDomain; -import nu.marginalia.converting.instruction.instructions.LoadDomainLink; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.model.EdgeDomain; - -import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import java.util.function.Consumer; - -public class LinksCompiler { - - public void compile(Consumer<Instruction> instructionConsumer, - EdgeDomain from, - List<ProcessedDocument> documents) { - - EdgeDomain[] domains = documents.stream() - .map(doc -> doc.details) - .filter(Objects::nonNull) - .flatMap(details -> details.linksExternal.stream()) - .map(link -> link.domain) - .distinct() - .toArray(EdgeDomain[]::new); - - DomainLink[] links = new DomainLink[domains.length]; - Arrays.setAll(links, i -> new DomainLink(from, domains[i])); - - instructionConsumer.accept(new LoadDomain(domains)); - instructionConsumer.accept(new LoadDomainLink(links)); - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java deleted file mode 100644 index dcd0201f..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java +++ /dev/null @@ -1,20 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.converting.instruction.instructions.LoadDomain; -import nu.marginalia.converting.instruction.instructions.LoadDomainLink; -import nu.marginalia.converting.instruction.instructions.LoadDomainRedirect; -import nu.marginalia.model.EdgeDomain; - -import java.util.List; -import java.util.function.Consumer; - -public class RedirectCompiler { - - public void compile(Consumer<Instruction> instructionConsumer, EdgeDomain from, EdgeDomain to) { - instructionConsumer.accept(new LoadDomain(to)); - instructionConsumer.accept(new LoadDomainLink(new DomainLink(from, to))); - instructionConsumer.accept(new LoadDomainRedirect(new DomainLink(from, to))); - } -} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/GeneratorType.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/GeneratorType.java 
similarity index 100% rename from code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/GeneratorType.java rename to code/processes/converting-process/src/main/java/nu/marginalia/converting/model/GeneratorType.java From 87a859329117aa3eb032c6c28519272f76adc180 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Sep 2023 10:11:04 +0200 Subject: [PATCH 09/14] (work-log) Fix bug where items weren't added to the current batch on logItem --- .../src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java index 2b8d8689..77539377 100644 --- a/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java +++ b/code/process-models/work-log/src/main/java/nu/marginalia/worklog/BatchingWorkLogImpl.java @@ -75,6 +75,7 @@ public class BatchingWorkLogImpl implements BatchingWorkLog { @Override public void logItem(String id) throws IOException { writeLogEntry(new AddItem(id)); + addItemToCurrentBatch(id); } @Override From c71f6ad417bfe9da09645e7444b202365bb8e7ef Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Sep 2023 10:11:57 +0200 Subject: [PATCH 10/14] (converter) Add heartbeats to the loader processes and execute the tasks in parallel for a ~2X speedup --- .../marginalia/converting/ConverterMain.java | 3 +- .../converting/writer/ConverterWriter.java | 50 ++++++++++++------- .../LoaderIndexJournalWriter.java | 2 +- .../nu/marginalia/loading/LoaderMain.java | 25 +++++++--- .../documents/DocumentLoaderService.java | 28 +++++++++-- .../documents/KeywordLoaderService.java | 29 +++++++++-- .../loading/domains/DomainIdRegistry.java | 11 ---- .../links/DomainLinksLoaderService.java | 24 +++++++-- .../domains/DomainLoaderServiceTest.java | 18 +++++-- .../links/DomainLinksLoaderServiceTest.java | 17 ++++++- .../loader/LoaderIndexJournalWriterTest.java | 2 +- 11 files changed, 152 insertions(+), 57 deletions(-) rename code/processes/loading-process/src/main/java/nu/marginalia/loading/{documents => }/LoaderIndexJournalWriter.java (98%) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index eefb2be2..7afab740 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -92,8 +92,6 @@ public class ConverterMain { final int maxPoolSize = Runtime.getRuntime().availableProcessors(); - - try (BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(plan.process.getLogFile()); ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, plan.process.getDir())) { @@ -111,6 +109,7 @@ public class ConverterMain { pool.submit(() -> { ProcessedDomain processed = processor.process(domain); converterWriter.accept(processed); + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); }); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java index 595601a5..ac0dd71c 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java @@ -3,6 +3,8 @@ package nu.marginalia.converting.writer; import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.worklog.BatchingWorkLog; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.nio.file.Path; import java.time.Duration; @@ -14,19 +16,22 @@ import java.util.concurrent.TimeUnit; public class ConverterWriter implements AutoCloseable { + private static final Logger logger = LoggerFactory.getLogger(ConverterWriter.class); + private final BatchingWorkLog workLog; private final Path basePath; - private final Duration switchInterval = - Duration.of(10, ChronoUnit.MINUTES); - private final ArrayBlockingQueue<ProcessedDomain> domainData = - new ArrayBlockingQueue<>(4); + private final Duration switchInterval + = Duration.of(10, ChronoUnit.MINUTES); + private final ArrayBlockingQueue<ProcessedDomain> domainData + = new ArrayBlockingQueue<>(4); private final Thread workerThread; - ConverterBatchWriter writer; + private ConverterBatchWriter currentWriter; volatile boolean running = true; + public ConverterWriter(BatchingWorkLog workLog, Path basePath) { this.workLog = workLog; this.basePath = basePath; @@ -44,20 +49,27 @@ public class ConverterWriter implements AutoCloseable { private void writerThread() { IntervalAction switcher = new IntervalAction(this::switchBatch, switchInterval); - writer = new ConverterBatchWriter(basePath, workLog.getBatchNumber()); + currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber()); while (running || !domainData.isEmpty()) { - var data = domainData.poll(10, TimeUnit.SECONDS); + // poll with a timeout so we have an + // opportunity to check the running condition + // ... 
we could interrupt the thread as well, but + // as we enter third party code it's difficult to guarantee it will deal + // well with being interrupted + var data = domainData.poll(1, TimeUnit.SECONDS); if (data == null) continue; String id = data.id; - if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) + if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) { + logger.warn("Skipping already logged item {}", id); continue; + } - writer.write(data); + currentWriter.write(data); workLog.logItem(id); @@ -72,10 +84,12 @@ public class ConverterWriter implements AutoCloseable { return false; } + // order matters here - writer.close(); + currentWriter.close(); workLog.logFinishedBatch(); - writer = new ConverterBatchWriter(basePath, workLog.getBatchNumber()); + logger.info("Switching to batch {}", workLog.getBatchNumber()); + currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber()); return true; } @@ -86,7 +100,7 @@ public class ConverterWriter implements AutoCloseable { workerThread.join(); // order matters here - writer.close(); + currentWriter.close(); workLog.logFinishedBatch(); } } @@ -105,17 +119,17 @@ class IntervalAction { /** Execute the provided action if enough time has passed * since the last successful invocation */ public void tick() { + var now = Instant.now(); if (nextActionInstant == null) { - nextActionInstant = Instant.now().plus(interval); + nextActionInstant = now.plus(interval); return; } - if (Instant.now().isBefore(nextActionInstant)) - return; - try { - if (action.call()) { - nextActionInstant = Instant.now().plus(interval); + if (now.isAfter(nextActionInstant) + && action.call()) + { + nextActionInstant = now.plus(interval); } } catch (Exception ex) { diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderIndexJournalWriter.java similarity index 98% rename from code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/LoaderIndexJournalWriter.java rename to code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderIndexJournalWriter.java index 0bfd6193..c3c9d6f9 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderIndexJournalWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.loading.documents; +package nu.marginalia.loading; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 675feb3d..46f206db 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -9,7 +9,6 @@ import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.linkdb.LinkdbWriter; import nu.marginalia.loading.documents.DocumentLoaderService; import nu.marginalia.loading.documents.KeywordLoaderService; -import nu.marginalia.loading.documents.LoaderIndexJournalWriter; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.loading.domains.DomainLoaderService; import nu.marginalia.loading.links.DomainLinksLoaderService; @@ -26,8 +25,11 @@ import org.slf4j.LoggerFactory; import 
java.nio.file.Path; import java.sql.SQLException; +import java.util.List; import java.util.Optional; import java.util.UUID; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX; @@ -106,12 +108,20 @@ public class LoaderMain { validBatchCount); try { - linksService - .loadLinks(domainIdRegistry, inputDataDir, validBatchCount); - keywordLoaderService - .loadKeywords(domainIdRegistry, inputDataDir, validBatchCount); - documentLoaderService - .loadDocuments(domainIdRegistry, inputDataDir, validBatchCount); + var results = ForkJoinPool.commonPool() + .invokeAll( + List.of( + () -> linksService.loadLinks(domainIdRegistry, heartbeat, inputDataDir, validBatchCount), + () -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputDataDir, validBatchCount), + () -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputDataDir, validBatchCount) + ) + ); + + for (var result : results) { + if (result.state() == Future.State.FAILED) { + throw result.exceptionNow(); + } + } instructions.ok(); } @@ -125,7 +135,6 @@ public class LoaderMain { heartbeat.shutDown(); } - System.exit(0); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java index c6ea5a5e..130957f8 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java @@ -11,6 +11,9 @@ import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.processed.DocumentRecordMetadataProjection; +import nu.marginalia.process.control.ProcessHeartbeat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; @@ -20,6 +23,8 @@ import java.util.List; @Singleton public class DocumentLoaderService { + private static final Logger logger = LoggerFactory.getLogger(DocumentLoaderService.class); + private final LinkdbWriter linkdbWriter; @Inject @@ -27,15 +32,30 @@ public class DocumentLoaderService { this.linkdbWriter = linkdbWriter; } - public void loadDocuments(DomainIdRegistry domainIdRegistry, + public boolean loadDocuments( + DomainIdRegistry domainIdRegistry, + ProcessHeartbeat processHeartbeat, Path processedDataPathBase, int untilBatch) throws IOException, SQLException { var documentFiles = ProcessedDataFileNames.listDocumentFiles(processedDataPathBase, untilBatch); - for (var file : documentFiles) { - loadDocumentsFromFile(domainIdRegistry, file); + + try (var taskHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("DOCUMENTS")) { + + int processed = 0; + + for (var file : documentFiles) { + taskHeartbeat.progress("LOAD", processed++, documentFiles.size()); + + loadDocumentsFromFile(domainIdRegistry, file); + } + taskHeartbeat.progress("LOAD", processed, documentFiles.size()); } + + logger.info("Finished"); + + return true; } private void loadDocumentsFromFile(DomainIdRegistry domainIdRegistry, Path file) @@ -45,6 +65,8 @@ public class DocumentLoaderService { LinkdbLoader loader = new LinkdbLoader(domainIdRegistry) ) { + logger.info("Loading document meta from {}", file); + stream.forEach(loader::accept); } } diff --git 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java index ef9b619e..9d9d0a4d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/KeywordLoaderService.java @@ -5,15 +5,20 @@ import com.google.inject.Singleton; import nu.marginalia.io.processed.DocumentRecordParquetFileReader; import nu.marginalia.io.processed.ProcessedDataFileNames; import nu.marginalia.keyword.model.DocumentKeywords; +import nu.marginalia.loading.LoaderIndexJournalWriter; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.processed.DocumentRecordKeywordsProjection; +import nu.marginalia.process.control.ProcessHeartbeat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; @Singleton public class KeywordLoaderService { + private static final Logger logger = LoggerFactory.getLogger(KeywordLoaderService.class); private final LoaderIndexJournalWriter writer; @Inject @@ -21,17 +26,33 @@ public class KeywordLoaderService { this.writer = writer; } - public void loadKeywords(DomainIdRegistry domainIdRegistry, + public boolean loadKeywords(DomainIdRegistry domainIdRegistry, + ProcessHeartbeat heartbeat, Path processedDataPathBase, int untilBatch) throws IOException { - var documentFiles = ProcessedDataFileNames.listDocumentFiles(processedDataPathBase, untilBatch); - for (var file : documentFiles) { - loadKeywordsFromFile(domainIdRegistry, file); + try (var task = heartbeat.createAdHocTaskHeartbeat("KEYWORDS")) { + + var documentFiles = ProcessedDataFileNames.listDocumentFiles(processedDataPathBase, untilBatch); + int processed = 0; + + for (var file : documentFiles) { + task.progress("LOAD", processed++, documentFiles.size()); + + loadKeywordsFromFile(domainIdRegistry, file); + } + + task.progress("LOAD", processed, documentFiles.size()); } + + logger.info("Finished"); + + return true; } private void loadKeywordsFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException { try (var stream = DocumentRecordParquetFileReader.streamKeywordsProjection(file)) { + logger.info("Loading keywords from {}", file); + stream.filter(DocumentRecordKeywordsProjection::hasKeywords) .forEach(proj -> insertKeywords(domainIdRegistry, proj)); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java index cb825641..2ab6ba46 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainIdRegistry.java @@ -1,7 +1,5 @@ package nu.marginalia.loading.domains; -import nu.marginalia.model.EdgeDomain; - import java.util.HashMap; import java.util.Map; @@ -20,15 +18,6 @@ public class DomainIdRegistry { return id; } - public int getDomainId(EdgeDomain domainName) { - return getDomainId(domainName.toString()); - } - - - public boolean isKnown(String domainName) { - return domainIds.containsKey(domainName); - } - void add(String domainName, int id) { domainIds.put(domainName, id); } diff --git 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java index c70647a6..d38112ca 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java @@ -7,6 +7,7 @@ import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; import nu.marginalia.io.processed.ProcessedDataFileNames; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.processed.DomainLinkRecord; +import nu.marginalia.process.control.ProcessHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,14 +28,29 @@ public class DomainLinksLoaderService { this.dataSource = dataSource; } - public void loadLinks(DomainIdRegistry domainIdRegistry, Path processedDataPathBase, int untilBatch) throws IOException, SQLException { + public boolean loadLinks(DomainIdRegistry domainIdRegistry, + ProcessHeartbeat heartbeat, + Path processedDataPathBase, + int untilBatch) throws IOException, SQLException { dropLinkData(); - var linkFiles = ProcessedDataFileNames.listDomainLinkFiles(processedDataPathBase, untilBatch); - for (var file : linkFiles) { - loadLinksFromFile(domainIdRegistry, file); + try (var task = heartbeat.createAdHocTaskHeartbeat("LINKS")) { + var linkFiles = ProcessedDataFileNames.listDomainLinkFiles(processedDataPathBase, untilBatch); + + int processed = 0; + + for (var file : linkFiles) { + task.progress("LOAD", processed++, linkFiles.size()); + + loadLinksFromFile(domainIdRegistry, file); + } + + task.progress("LOAD", processed, linkFiles.size()); } + + logger.info("Finished"); + return true; } private void dropLinkData() throws SQLException { diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java index 9340fe14..a3d16162 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/domains/DomainLoaderServiceTest.java @@ -7,10 +7,10 @@ import nu.marginalia.io.processed.ProcessedDataFileNames; import nu.marginalia.loader.DbTestUtil; import nu.marginalia.model.processed.DomainLinkRecord; import nu.marginalia.model.processed.DomainRecord; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import org.junit.jupiter.api.*; +import org.mockito.Mockito; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; @@ -29,6 +29,7 @@ import static org.junit.jupiter.api.Assertions.*; @Testcontainers class DomainLoaderServiceTest { List<Path> toDelete = new ArrayList<>(); + ProcessHeartbeat heartbeat; @Container static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb") @@ -38,6 +39,15 @@ class DomainLoaderServiceTest { .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); + @BeforeEach + public void setUp() { + heartbeat = 
Mockito.mock(ProcessHeartbeat.class); + + Mockito.when(heartbeat.createAdHocTaskHeartbeat(Mockito.anyString())).thenReturn( + Mockito.mock(ProcessAdHocTaskHeartbeat.class) + ); + } + @AfterEach public void tearDown() throws IOException { for (var path : Lists.reverse(toDelete)) { diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java index 2f1f9b00..df80020f 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java @@ -8,9 +8,14 @@ import nu.marginalia.loader.DbTestUtil; import nu.marginalia.loading.domains.DomainLoaderService; import nu.marginalia.model.processed.DomainLinkRecord; import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeatImpl; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; @@ -27,6 +32,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; @Testcontainers class DomainLinksLoaderServiceTest { List<Path> toDelete = new ArrayList<>(); + ProcessHeartbeat heartbeat; @Container static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb") @@ -36,6 +42,15 @@ class DomainLinksLoaderServiceTest { .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); + @BeforeEach + public void setUp() { + heartbeat = Mockito.mock(ProcessHeartbeat.class); + + Mockito.when(heartbeat.createAdHocTaskHeartbeat(Mockito.anyString())).thenReturn( + Mockito.mock(ProcessAdHocTaskHeartbeat.class) + ); + } + @AfterEach public void tearDown() throws IOException { for (var path : Lists.reverse(toDelete)) { @@ -87,7 +102,7 @@ class DomainLinksLoaderServiceTest { var domainRegistry = domainService.getOrCreateDomainIds(workDir, 2); var dls = new DomainLinksLoaderService(dataSource); - dls.loadLinks(domainRegistry, workDir, 2); + dls.loadLinks(domainRegistry, heartbeat, workDir, 2); Map<Integer, Set<Integer>> expected = new HashMap<>(); Map<Integer, Set<Integer>> actual = new HashMap<>(); diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java index 709b3110..472e692d 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java @@ -5,7 +5,7 @@ import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.loading.documents.LoaderIndexJournalWriter; +import nu.marginalia.loading.LoaderIndexJournalWriter; import nu.marginalia.model.idx.DocumentMetadata; import 
nu.marginallia.index.journal.IndexJournalFileNames; import org.junit.jupiter.api.AfterEach; From eaeb23d41e51e2ce6cc5a836cbb734c47e7267a6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Sep 2023 11:21:44 +0200 Subject: [PATCH 11/14] (refactor) Remove converting-model package completely --- .../marginalia/model/html}/HtmlStandard.java | 4 +- code/features-convert/pubdate/build.gradle | 1 - .../pubdate/PubDateFromHtmlStandard.java | 2 +- .../marginalia/pubdate/PubDateHeuristic.java | 2 +- .../nu/marginalia/pubdate/PubDateParser.java | 2 +- .../nu/marginalia/pubdate/PubDateSniffer.java | 2 +- .../PubDateHeuristicDOMParsingPass1.java | 2 +- .../PubDateHeuristicDOMParsingPass2.java | 2 +- ...PubDateHeuristicGuessFromHtmlStandard.java | 2 +- .../PubDateHeuristicHtml5AnyTimeTag.java | 2 +- .../PubDateHeuristicHtml5ArticleDateTag.java | 2 +- .../PubDateHeuristicHtml5ItempropDateTag.java | 2 +- .../heuristic/PubDateHeuristicJSONLD.java | 2 +- .../PubDateHeuristicLastModified.java | 2 +- .../heuristic/PubDateHeuristicMicrodata.java | 2 +- .../heuristic/PubDateHeuristicOpenGraph.java | 2 +- .../heuristic/PubDateHeuristicRDFaTag.java | 2 +- .../PubDateHeuristicUrlPatternPass1.java | 2 +- .../PubDateHeuristicUrlPatternPass2.java | 2 +- .../pubdate/PubDateSnifferTest.java | 2 +- .../converting-model/build.gradle | 47 ------------------- .../process-models/converting-model/readme.md | 3 -- .../processes/converting-process/build.gradle | 1 - .../model/ProcessedDocumentDetails.java | 1 + .../processor/logic/DocumentValuator.java | 3 +- .../logic/HtmlStandardExtractor.java | 2 +- .../processor/logic/links/FileLinks.java | 1 - .../AbstractDocumentProcessorPlugin.java | 3 +- .../plugin/HtmlDocumentProcessorPlugin.java | 3 +- .../PlainTextDocumentProcessorPlugin.java | 3 +- .../sideload/StackexchangeSideloader.java | 1 + .../converting/ConvertingIntegrationTest.java | 2 +- code/processes/crawling-process/build.gradle | 1 - code/processes/loading-process/build.gradle | 1 - code/processes/readme.md | 27 +++++------ code/tools/experiment-runner/build.gradle | 1 - settings.gradle | 1 - 37 files changed, 40 insertions(+), 102 deletions(-) rename code/{process-models/converting-model/src/main/java/nu/marginalia/converting/model => common/model/src/main/java/nu/marginalia/model/html}/HtmlStandard.java (78%) delete mode 100644 code/process-models/converting-model/build.gradle delete mode 100644 code/process-models/converting-model/readme.md diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java b/code/common/model/src/main/java/nu/marginalia/model/html/HtmlStandard.java similarity index 78% rename from code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java rename to code/common/model/src/main/java/nu/marginalia/model/html/HtmlStandard.java index ecb3d630..cdd23742 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java +++ b/code/common/model/src/main/java/nu/marginalia/model/html/HtmlStandard.java @@ -1,6 +1,6 @@ -package nu.marginalia.converting.model; - +package nu.marginalia.model.html; +// This class really doesn't belong anywhere, but will squat here for now public enum HtmlStandard { PLAIN(0, 1), UNKNOWN(0, 1), diff --git a/code/features-convert/pubdate/build.gradle b/code/features-convert/pubdate/build.gradle index 1535b203..ee256ebf 100644 --- a/code/features-convert/pubdate/build.gradle +++ b/code/features-convert/pubdate/build.gradle @@ 
-15,7 +15,6 @@ java { dependencies { implementation project(':code:common:model') - implementation project(':code:process-models:converting-model') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java index d7777e0e..dfbab8d3 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; public class PubDateFromHtmlStandard { /** Used to bias pub date heuristics */ diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java index ddc3b9c4..56355806 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateHeuristic.java @@ -2,7 +2,7 @@ package nu.marginalia.pubdate; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java index 1abd84dd..1fbade80 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import java.time.DateTimeException; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java index b8b9b704..90b25915 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.EdgeUrl; import nu.marginalia.pubdate.heuristic.*; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java index 5f8c7ffc..28059f64 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import 
nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java index 2bcf5dab..bb625180 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateFromHtmlStandard; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java index c082f555..30486f2f 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java index ac8dbf01..30513a47 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java index 0bcb28dd..45c8b091 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java 
b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java index ac88fcb4..aa09d392 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java index d3173b81..3ddf58eb 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -5,7 +5,7 @@ import com.google.gson.GsonBuilder; import com.google.gson.JsonSyntaxException; import com.google.gson.annotations.SerializedName; import lombok.ToString; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java index 69a780b9..ca42d469 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java index 62a16f5a..584375f2 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java index 023e954c..74a7a654 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java +++ 
b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java index 18bd7e80..1ed20019 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; import nu.marginalia.pubdate.PubDateHeuristic; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java index 65b1d4da..6a6d5630 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java index e05a10ef..ea3ab9d9 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java @@ -1,6 +1,6 @@ package nu.marginalia.pubdate.heuristic; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; diff --git a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java index 1794c196..efd320e8 100644 --- a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java +++ b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java @@ -2,7 +2,7 @@ package nu.marginalia.pubdate; import nu.marginalia.WmsaHome; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; diff --git 
a/code/process-models/converting-model/build.gradle b/code/process-models/converting-model/build.gradle deleted file mode 100644 index cb25b932..00000000 --- a/code/process-models/converting-model/build.gradle +++ /dev/null @@ -1,47 +0,0 @@ -plugins { - id 'java' - id "io.freefair.lombok" version "8.2.2" - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(20)) - } -} -dependencies { - - //implementation project(':third-party:monkey-patch-gson') - - implementation project(':code:common:db') - implementation project(':code:common:model') - implementation project(':code:api:index-api') - implementation project(':code:common:service-discovery') - implementation project(':code:common:service-client') - implementation project(':code:libraries:language-processing') - - implementation project(':code:features-convert:keyword-extraction') - - implementation libs.lombok - annotationProcessor libs.lombok - implementation libs.bundles.slf4j - - implementation libs.notnull - implementation libs.trove - implementation libs.fastutil - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - -test { - useJUnitPlatform() -} - -task fastTests(type: Test) { - useJUnitPlatform { - excludeTags "slow" - } -} diff --git a/code/process-models/converting-model/readme.md b/code/process-models/converting-model/readme.md deleted file mode 100644 index 52973e48..00000000 --- a/code/process-models/converting-model/readme.md +++ /dev/null @@ -1,3 +0,0 @@ -# Converting Models - -!!To be deleted!! \ No newline at end of file diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index cb8e80e1..fdc37e75 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -43,7 +43,6 @@ dependencies { implementation project(':code:libraries:big-string') implementation project(':code:libraries:language-processing') - implementation project(':code:process-models:converting-model') implementation project(':code:process-models:processed-data') implementation project(':code:process-models:work-log') implementation project(':code:process-models:crawling-model') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java index 97cb964b..ee70fb14 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.model; import lombok.ToString; import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeUrl; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index baacb766..218f16b8 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -2,14 +2,13 @@ package 
nu.marginalia.converting.processor.logic; import crawlercommons.utils.Strings; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.model.crawl.HtmlFeature; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeVisitor; import java.util.Set; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java index 52537f68..f0f994da 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.processor.logic; import com.google.common.base.Strings; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import org.jsoup.nodes.DocumentType; import org.slf4j.Logger; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java index e8809b67..10c31606 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java @@ -4,7 +4,6 @@ import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; -import java.nio.file.Path; import java.util.HashSet; import java.util.Set; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 2d2f58ca..913ba81d 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -1,10 +1,9 @@ package nu.marginalia.converting.processor.plugin; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 7d48bf3b..c51e9690 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -13,10 +13,9 @@ import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index bc288430..797b3b6d 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -5,10 +5,9 @@ import com.google.inject.name.Named; import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java index 0f40639b..07ad6391 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java @@ -10,6 +10,7 @@ import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index d43ddecf..5c6ebe81 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -3,7 +3,7 @@ package nu.marginalia.converting; import 
com.google.inject.Guice; import com.google.inject.Injector; -import nu.marginalia.converting.model.HtmlStandard; +import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawling.io.SerializableCrawlDataStream; diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 5dfe0556..27b7bf32 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -34,7 +34,6 @@ dependencies { implementation project(':code:libraries:language-processing') implementation project(':code:libraries:easy-lsh') implementation project(':code:process-models:crawling-model') - implementation project(':code:process-models:converting-model') implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 736fec8d..a890ec23 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -35,7 +35,6 @@ dependencies { testImplementation project(':code:services-core:search-service') implementation project(':code:process-models:crawling-model') - implementation project(':code:process-models:converting-model') implementation project(':code:process-models:processed-data') implementation project(':code:process-models:work-log') diff --git a/code/processes/readme.md b/code/processes/readme.md index 6b1ccede..b2839a09 100644 --- a/code/processes/readme.md +++ b/code/processes/readme.md @@ -11,13 +11,17 @@ based on the content in the database. ## 2. Converting Process The [converting-process](converting-process/) reads crawl data from the crawling step and -processes them, extracting keywords and metadata and saves them as compressed JSON models -described in [converting-model](../process-models/converting-model/). +processes them, extracting keywords and metadata and saves them as parquet files +described in [processed-data](../process-models/processed-data/). ## 3. Loading Process -The [loading-process](loading-process/) reads the processed data and creates an index journal -and lexicon, and loads domains and addresses into the MariaDB-database. +The [loading-process](loading-process/) reads the processed data. + +It creates an [index journal](../features-index/index-journal), +a [link database](../common/linkdb), +and loads domains and domain-links +into the [MariaDB database](../common/db). ## 4. Index Construction Process @@ -56,21 +60,14 @@ Schematically the crawling and loading process looks like this: +------------+ features, links, URLs | //==================\\ - || Compressed JSON: || Processed - || URLs[] || Files + || Parquet: || Processed + || Documents[] || Files || Domains[] || || Links[] || - || Keywords[] || - || ... || - || URLs[] || - || Domains[] || - || Links[] || - || Keywords[] || - || ... 
|| \\==================// | - +------------+ - | LOADING | Insert URLs in link DB + +------------+ Insert domains into mariadb + | LOADING | Insert URLs, titles in link DB | STEP | Insert keywords in Index +------------+ | diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index 77d84e21..3eb4c244 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -31,7 +31,6 @@ dependencies { implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:big-string') implementation project(':code:processes:converting-process') - implementation project(':code:process-models:converting-model') implementation project(':code:process-models:crawling-model') implementation project(':code:features-convert:adblock') diff --git a/settings.gradle b/settings.gradle index af44349d..dc42ead8 100644 --- a/settings.gradle +++ b/settings.gradle @@ -63,7 +63,6 @@ include 'code:processes:loading-process' include 'code:processes:index-constructor-process' include 'code:processes:test-data' -include 'code:process-models:converting-model' include 'code:process-models:crawling-model' include 'code:process-models:work-log' include 'code:process-models:processed-data' From 35996d0adb02d9de7cb62f6f01af5920190144e4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Sep 2023 11:33:36 +0200 Subject: [PATCH 12/14] (docs) Update the documentation with up-to-date information --- code/common/linkdb/readme.md | 11 +++++++++++ code/process-models/processed-data/readme.md | 18 ++++++++++++++++++ third-party/parquet-floor/readme.md | 8 ++++++++ 3 files changed, 37 insertions(+) create mode 100644 code/common/linkdb/readme.md create mode 100644 code/process-models/processed-data/readme.md diff --git a/code/common/linkdb/readme.md b/code/common/linkdb/readme.md new file mode 100644 index 00000000..a87166bc --- /dev/null +++ b/code/common/linkdb/readme.md @@ -0,0 +1,11 @@ +The link database contains information about links, +such as their ID, their URL, their title, their description, +and so forth. + +The link database is a SQLite file. The reason this information +is not kept in the MariaDB database is that doing so would make updates to +this information take effect in production immediately, even before +the information was searchable. + +It is constructed by the [loading-process](../../processes/loading-process), and consumed +by the [search-service](../../services-core/search-service). \ No newline at end of file diff --git a/code/process-models/processed-data/readme.md b/code/process-models/processed-data/readme.md new file mode 100644 index 00000000..4bc8c857 --- /dev/null +++ b/code/process-models/processed-data/readme.md @@ -0,0 +1,18 @@ +The processed-data package contains models and logic for +reading and writing parquet files with the output from the +[converting-process](../../processes/converting-process).
+ +Main models: + +* [DocumentRecord](src/main/java/nu/marginalia/model/processed/DocumentRecord.java) +* * [DocumentRecordKeywordsProjection](src/main/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java) +* * [DocumentRecordMetadataProjection](src/main/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java) +* [DomainLinkRecord](src/main/java/nu/marginalia/model/processed/DomainLinkRecord.java) +* [DomainRecord](src/main/java/nu/marginalia/model/processed/DomainRecord.java) + +Since parquet is a column based format, some of the readable models are projections +that only read parts of the input file. + +## See Also + +[third-party/parquet-floor](../../../third-party/parquet-floor) \ No newline at end of file diff --git a/third-party/parquet-floor/readme.md b/third-party/parquet-floor/readme.md index b1e21c40..70715f1e 100644 --- a/third-party/parquet-floor/readme.md +++ b/third-party/parquet-floor/readme.md @@ -6,3 +6,11 @@ Git: https://github.com/strategicblue/parquet-floor It's basically an adaptor for Parquet I/O without needing to pull half of Hadoop into your project. + +The library has been modified with support for reading +and writing lists of values, and the default +compression has been altered to zstd. + +# Further reading: + +https://parquet.apache.org/docs/ \ No newline at end of file From 5e5aaf9a7e4091486eb32011dffff68e31ffbb52 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Sep 2023 12:12:07 +0200 Subject: [PATCH 13/14] (converter, control) Re-enable sideloading encyclopedia data --- .../marginalia/converting/ConverterMain.java | 10 +++--- .../writer/ConverterBatchWriter.java | 32 ++++++++++++++++--- .../nu/marginalia/control/ControlService.java | 1 + .../control/svc/ControlActionsService.java | 20 +++++++++++- .../resources/templates/control/actions.hdb | 15 +++++++++ 5 files changed, 68 insertions(+), 10 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 7afab740..99a46893 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -7,6 +7,7 @@ import com.google.inject.Injector; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSourceFactory; +import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.converting.writer.ConverterWriter; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; @@ -25,6 +26,7 @@ import nu.marginalia.converting.processor.DomainProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.nio.file.Path; import java.sql.SQLException; import java.util.Optional; @@ -82,10 +84,10 @@ public class ConverterMain { heartbeat.start(); } - public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception { - int maxPoolSize = 16; - - // FIXME + public void convert(SideloadSource sideloadSource, Path writeDir) throws IOException { + try (var writer = new ConverterBatchWriter(writeDir, 0)) { + writer.write(sideloadSource); + } } public void convert(CrawlPlan plan) throws Exception { diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index cea46f20..cc9f0467 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -2,7 +2,9 @@ package nu.marginalia.converting.writer; import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; +import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.io.processed.DocumentRecordParquetFileWriter; import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; import nu.marginalia.io.processed.DomainRecordParquetFileWriter; @@ -24,14 +26,15 @@ import java.util.concurrent.Callable; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.Future; -public class ConverterBatchWriter { +/** Writer for a single batch of converter parquet files */ +public class ConverterBatchWriter implements AutoCloseable { private final DomainRecordParquetFileWriter domainWriter; private final DomainLinkRecordParquetFileWriter domainLinkWriter; private final DocumentRecordParquetFileWriter documentWriter; private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class); - ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { + public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { domainWriter = new DomainRecordParquetFileWriter( ProcessedDataFileNames.domainFileName(basePath, batchNumber) ); @@ -43,6 +46,14 @@ public class ConverterBatchWriter { ); } + public void write(SideloadSource sideloadSource) throws IOException { + var domain = sideloadSource.getDomain(); + + writeDomainData(domain); + + writeDocumentData(domain.domain, sideloadSource.getDocumentsStream()); + } + public void write(ProcessedDomain domain) { var results = ForkJoinPool.commonPool().invokeAll( writeTasks(domain) @@ -67,10 +78,22 @@ public class ConverterBatchWriter { if (domain.documents == null) return this; - String domainName = domain.domain.toString(); + writeDocumentData(domain.domain, domain.documents.iterator()); + + return this; + } + + private void writeDocumentData(EdgeDomain domain, + Iterator documentIterator) + throws IOException + { + int ordinal = 0; - for (var document : domain.documents) { + String domainName = domain.toString(); + + while (documentIterator.hasNext()) { + var document = documentIterator.next(); if (document.details == null) { new DocumentRecord( domainName, @@ -119,7 +142,6 @@ public class ConverterBatchWriter { ordinal++; } - return this; } private Object writeLinkData(ProcessedDomain domain) throws IOException { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 43e9d985..2b6b6d58 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -188,6 +188,7 @@ public class ControlService extends Service { Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, 
redirectToActors); Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors); Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors); + Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors); // Review Random Domains Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java index a8267d40..337a05bf 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -4,18 +4,20 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.actor.ControlActors; import nu.marginalia.control.actor.Actor; +import nu.marginalia.control.actor.task.ConvertActor; import nu.marginalia.db.DomainTypes; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.search.client.SearchClient; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.id.ServiceId; import spark.Request; import spark.Response; import spark.Spark; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.UUID; @Singleton @@ -97,6 +99,22 @@ public class ControlActionsService { return ""; } + public Object sideloadEncyclopedia(Request request, Response response) throws Exception { + + Path sourcePath = Path.of(request.queryParams("source")); + if (!Files.exists(sourcePath)) { + Spark.halt(404); + return "No such file " + sourcePath; + } + + eventLog.logEvent("USER-ACTION", "SIDELOAD ENCYCLOPEDIA"); + + actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_ENCYCLOPEDIA, sourcePath.toString()); + + return ""; + } + + public Object triggerRepartition(Request request, Response response) throws Exception { indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); diff --git a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb index 9ec528d4..e9768322 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb @@ -24,6 +24,21 @@ + + Sideload Encyclopedia

+        This will load pre-digested encyclopedia data
+        from an encyclopedia.marginalia.nu-style database.
        [stripped form markup: a form that POSTs a "source" path to /public/actions/sideload-encyclopedia]
+ + Reload Blogs List From c67d95c00f0e6bb42ecc742ce718544aca2de21e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Sep 2023 14:13:03 +0200 Subject: [PATCH 14/14] (converter) Write dummy processor log when sideloading --- .../java/nu/marginalia/converting/ConverterMain.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 99a46893..38b72164 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -84,9 +84,14 @@ public class ConverterMain { heartbeat.start(); } - public void convert(SideloadSource sideloadSource, Path writeDir) throws IOException { - try (var writer = new ConverterBatchWriter(writeDir, 0)) { + public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception { + try (var writer = new ConverterBatchWriter(writeDir, 0); + BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(writeDir.resolve("processor.log")) + ) { writer.write(sideloadSource); + + // We write an empty log with just a finish marker for the sideloading action + batchingWorkLog.logFinishedBatch(); } }