diff --git a/code/features-convert/stackexchange-xml/build.gradle b/code/features-convert/stackexchange-xml/build.gradle index daa26cff..8126187d 100644 --- a/code/features-convert/stackexchange-xml/build.gradle +++ b/code/features-convert/stackexchange-xml/build.gradle @@ -13,7 +13,6 @@ java { dependencies { implementation libs.bundles.slf4j - implementation 'org.tukaani:xz:1.8' implementation project(':code:libraries:blocking-thread-pool') implementation project(':code:common:model') implementation libs.notnull @@ -26,6 +25,7 @@ dependencies { implementation libs.zstd implementation libs.trove implementation libs.commons.compress + implementation libs.xz testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java index e9bbc38f..3f588b02 100644 --- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java +++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java @@ -36,8 +36,8 @@ public class StackExchangePostsDb { public static void create(String domain, Path sqliteFile, Path stackExchange7zFile) { - if (Files.exists(sqliteFile)) - Files.delete(sqliteFile); + Files.deleteIfExists(sqliteFile); + String connStr = "jdbc:sqlite:" + sqliteFile; try (var connection = DriverManager.getConnection(connStr); diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 556f8015..52b093fc 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -95,8 +95,6 @@ dependencies { testImplementation libs.bundles.junit testImplementation 
libs.mockito - implementation 'org.tukaani:xz:1.8' - testImplementation project(':code:processes:test-data') testImplementation project(':code:processes:crawling-process') } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index e6e824dc..87d04df9 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -240,50 +240,57 @@ public class ConverterMain extends ProcessMainClass { var msgOpt = getMessage(inbox, nu.marginalia.mqapi.converting.ConvertRequest.class.getSimpleName()); var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); - var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class); + try { + var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class); - return switch(request.action) { - case ConvertCrawlData -> { - var crawlData = fileStorageService.getStorage(request.crawlStorage); - var processData = fileStorageService.getStorage(request.processedDataStorage); + return switch (request.action) { + case ConvertCrawlData -> { + var crawlData = fileStorageService.getStorage(request.crawlStorage); + var processData = fileStorageService.getStorage(request.processedDataStorage); - var plan = new CrawlPlan(null, - new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), - new CrawlPlan.WorkDir(processData.path(), "processor.log")); + var plan = new CrawlPlan(null, + new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), + new CrawlPlan.WorkDir(processData.path(), "processor.log")); - yield new ConvertCrawlDataAction(plan, msg, inbox); - } - case SideloadEncyclopedia -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); + yield new 
ConvertCrawlDataAction(plan, msg, inbox); + } + case SideloadEncyclopedia -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); - yield new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(Path.of(request.inputSource), request.baseUrl), - processData.asPath(), - msg, inbox); - } - case SideloadDirtree -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); + yield new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(Path.of(request.inputSource), request.baseUrl), + processData.asPath(), + msg, inbox); + } + case SideloadDirtree -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); - yield new SideloadAction( - sideloadSourceFactory.sideloadDirtree(Path.of(request.inputSource)), - processData.asPath(), - msg, inbox); - } - case SideloadWarc -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); + yield new SideloadAction( + sideloadSourceFactory.sideloadDirtree(Path.of(request.inputSource)), + processData.asPath(), + msg, inbox); + } + case SideloadWarc -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); - yield new SideloadAction( - sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)), - processData.asPath(), - msg, inbox); - } - case SideloadStackexchange -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); + yield new SideloadAction( + sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)), + processData.asPath(), + msg, inbox); + } + case SideloadStackexchange -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); - yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(Path.of(request.inputSource)), - processData.asPath(), - msg, inbox); - } - }; + yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(Path.of(request.inputSource)), + 
processData.asPath(), + msg, inbox); + } + }; + } + catch (Exception ex) { + inbox.sendResponse(msg, MqInboxResponse.err(STR."\{ex.getClass().getSimpleName()}: \{ex.getMessage()}")); + + throw ex; + } } private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 808d4224..058c0eba 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -16,6 +16,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.util.Collection; +import java.util.List; public class SideloadSourceFactory { private final Gson gson; @@ -57,14 +58,21 @@ public class SideloadSourceFactory { return warcSideloadFactory.createSideloaders(pathToWarcFiles); } - /** Do not use, this code isn't finished */ public Collection sideloadStackexchange(Path pathToDbFileRoot) throws IOException { - try (var dirs = Files.walk(pathToDbFileRoot)) { - return dirs - .filter(Files::isRegularFile) - .filter(f -> f.toFile().getName().endsWith(".db")) - .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor)) - .toList(); + if (Files.isRegularFile(pathToDbFileRoot)) { + return List.of(new StackexchangeSideloader(pathToDbFileRoot, sentenceExtractorProvider, documentKeywordExtractor)); + } + else if (Files.isDirectory(pathToDbFileRoot)) { + try (var dirs = Files.walk(pathToDbFileRoot)) { + return dirs + .filter(Files::isRegularFile) + .filter(f -> f.toFile().getName().endsWith(".db")) + .map(dbFile -> new StackexchangeSideloader(dbFile, 
sentenceExtractorProvider, documentKeywordExtractor)) + .toList(); + } + } + else { // unix socket, etc + throw new IllegalArgumentException("Path to stackexchange db file(s) must be a file or directory"); } } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java index f192a961..5ba88432 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java @@ -21,23 +21,33 @@ public class WarcSideloadFactory { } public Collection createSideloaders(Path pathToWarcFiles) throws IOException { - final List files = new ArrayList<>(); - - try (var stream = Files.list(pathToWarcFiles)) { - stream - .filter(Files::isRegularFile) - .filter(this::isWarcFile) - .forEach(files::add); + if (Files.isRegularFile(pathToWarcFiles)) { + return List.of(new WarcSideloader(pathToWarcFiles, processing)); } + else if (Files.isDirectory(pathToWarcFiles)) { - List sources = new ArrayList<>(); + final List files = new ArrayList<>(); - for (Path file : files) { - sources.add(new WarcSideloader(file, processing)); + try (var stream = Files.list(pathToWarcFiles)) { + stream + .filter(Files::isRegularFile) + .filter(this::isWarcFile) + .forEach(files::add); + + } + + List sources = new ArrayList<>(); + + for (Path file : files) { + sources.add(new WarcSideloader(file, processing)); + } + + return sources; + } + else { + throw new IllegalArgumentException("Path " + pathToWarcFiles + " is neither a file nor a directory"); } - - return sources; } private boolean isWarcFile(Path path) { diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index adaab6fb..d26eadce 100644 --- 
a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -40,6 +40,7 @@ dependencies { implementation project(':code:process-models:crawling-model') implementation project(':code:features-crawl:link-parser') implementation project(':code:features-convert:data-extractors') + implementation project(':code:features-convert:stackexchange-xml') implementation project(':code:features-index:index-journal') implementation project(':code:api:index-api') implementation project(':code:api:query-api') diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java index 40b547a5..9cd08ea4 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java @@ -10,6 +10,8 @@ import nu.marginalia.actor.state.Resume; import nu.marginalia.encyclopedia.EncyclopediaConverter; import nu.marginalia.process.ProcessOutboxes; import nu.marginalia.process.ProcessService; +import nu.marginalia.sideload.SideloadHelper; +import nu.marginalia.sideload.StackExchangeSideloadHelper; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; @@ -21,11 +23,8 @@ import nu.marginalia.mqapi.converting.ConvertRequest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; -import java.util.zip.CRC32; @Singleton public class ConvertActor extends RecordActorPrototype { @@ -109,7 +108,7 @@ public class ConvertActor extends RecordActorPrototype { if (source.toLowerCase().endsWith(".zim")) { // If we're fed a ZIM file, we need to convert it to a sqlite database first - String hash 
= getCrc32FileHash(sourcePath); + String hash = SideloadHelper.getCrc32FileHash(sourcePath); // To avoid re-converting the same file, we'll assign the file a name based on its hash // and the original filename. This way, if we're fed the same file again, we'll be able to just @@ -179,6 +178,10 @@ public class ConvertActor extends RecordActorPrototype { storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW); + // Convert stackexchange data to sqlite database + // (we can't use a Predigest- step here because the conversion is too complicated) + StackExchangeSideloadHelper.convertStackexchangeData(sourcePath); + // Pre-send convert request yield new ConvertWait( @@ -200,21 +203,7 @@ public class ConvertActor extends RecordActorPrototype { }; } - private String getCrc32FileHash(Path file) throws IOException { - ByteBuffer buffer = ByteBuffer.allocate(8192); - try (var channel = Files.newByteChannel(file)) { - CRC32 crc = new CRC32(); - - while (channel.read(buffer) > 0) { - buffer.flip(); - crc.update(buffer); - buffer.clear(); - } - - return Long.toHexString(crc.getValue()); - } - } @Override public String describe() { diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/sideload/SideloadHelper.java b/code/services-core/executor-service/src/main/java/nu/marginalia/sideload/SideloadHelper.java new file mode 100644 index 00000000..f8b4c67d --- /dev/null +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/sideload/SideloadHelper.java @@ -0,0 +1,25 @@ +package nu.marginalia.sideload; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.zip.CRC32; + +public class SideloadHelper { + public static String getCrc32FileHash(Path file) throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(8192); + + try (var channel = Files.newByteChannel(file)) { + CRC32 crc = new CRC32(); + + while (channel.read(buffer) > 0) { + 
buffer.flip(); + crc.update(buffer); + buffer.clear(); + } + + return Long.toHexString(crc.getValue()); + } + } +} diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/sideload/StackExchangeSideloadHelper.java b/code/services-core/executor-service/src/main/java/nu/marginalia/sideload/StackExchangeSideloadHelper.java new file mode 100644 index 00000000..444146ad --- /dev/null +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/sideload/StackExchangeSideloadHelper.java @@ -0,0 +1,102 @@ +package nu.marginalia.sideload; + +import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.Optional; +import java.util.zip.CRC32; + +/** Contains helper functions for pre-converting stackexchange style 7z + * files to marginalia-digestible sqlite databases*/ +public class StackExchangeSideloadHelper { + private static final Logger logger = LoggerFactory.getLogger(StackExchangeSideloadHelper.class); + + /** Looks for stackexchange 7z files in the given path and converts them to sqlite databases. 
+ * The function is idempotent, so it is safe to call it multiple times on the same path + * (it will not re-convert files that have already been successfully converted) + * */ + public static void convertStackexchangeData(Path sourcePath) { + if (Files.isDirectory(sourcePath)) { + try (var contents = Files.list(sourcePath)) { + contents.filter(Files::isRegularFile) + .parallel() + .forEach(StackExchangeSideloadHelper::convertSingleStackexchangeFile); + } catch (IOException ex) { + logger.warn("Failed to convert stackexchange 7z file to sqlite database", ex); + } + } else if (Files.isRegularFile(sourcePath)) { + convertSingleStackexchangeFile(sourcePath); + } + } + + private static void convertSingleStackexchangeFile(Path sourcePath) { + String fileName = sourcePath.toFile().getName(); + + if (fileName.endsWith(".db")) return; + if (!fileName.endsWith(".7z")) return; + + Optional domain = getStackexchangeDomainFromFilename(fileName); + if (domain.isEmpty()) + return; + + try { + Path destPath = getStackexchangeDbPath(sourcePath); + if (Files.exists(destPath)) return; + + Path tempFile = Files.createTempFile(destPath.getParent(), "processed", "db.tmp"); + try { + logger.info("Converting stackexchange 7z file {} to sqlite database", sourcePath); + StackExchangePostsDb.create(domain.get(), tempFile, sourcePath); + logger.info("Finished converting stackexchange 7z file {} to sqlite database", sourcePath); + Files.move(tempFile, destPath, StandardCopyOption.REPLACE_EXISTING); + } catch (Exception e) { + logger.error("Failed to convert stackexchange 7z file to sqlite database", e); + Files.deleteIfExists(tempFile); + Files.deleteIfExists(destPath); + } + } catch (IOException ex) { + logger.warn("Failed to convert stackexchange 7z file to sqlite database", ex); + } + } + + private static Path getStackexchangeDbPath(Path sourcePath) throws IOException { + String fileName = sourcePath.toFile().getName(); + String hash = SideloadHelper.getCrc32FileHash(sourcePath); + + 
return sourcePath.getParent().resolve(STR."\{fileName}.\{hash}.db"); + } + + private static Optional getStackexchangeDomainFromFilename(String fileName) { + // We are only interested in .tld.7z files + if (!fileName.endsWith(".7z") && fileName.length() > 7) + return Optional.empty(); + + + // Stackoverflow is special, because it has one 7z file per site + // (we only want Posts) + + if (fileName.equals("stackoverflow-Posts.7z")) + return Optional.of("stackoverflow.com"); + else if (fileName.startsWith("stackoverflow.com-")) { + return Optional.empty(); + } + + // For stackexchange, we filter out the meta archives + + // We are not interested in the meta files + if (fileName.startsWith("meta.")) + return Optional.empty(); + if (fileName.contains(".meta.")) + return Optional.empty(); + + // Pattern is 'foobar.stackexchange.com.7z' + return Optional.of(fileName.substring(0, fileName.length() - 3)); + } + +} \ No newline at end of file diff --git a/code/tools/stackexchange-converter/build.gradle b/code/tools/stackexchange-converter/build.gradle deleted file mode 100644 index e41faf17..00000000 --- a/code/tools/stackexchange-converter/build.gradle +++ /dev/null @@ -1,40 +0,0 @@ -plugins { - id 'java' - - id 'application' - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) - } -} - -application { - mainClass = 'nu.marginalia.tools.StackexchangeConverter' - applicationName = 'stackexchange-converter' -} - -tasks.distZip.enabled = false - -dependencies { - implementation project(':code:features-convert:stackexchange-xml') - - implementation libs.bundles.slf4j - implementation libs.notnull - - implementation libs.guice - implementation libs.jsoup - implementation libs.trove - implementation libs.fastutil - - implementation libs.bundles.nlp - implementation libs.commons.lang3 - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - diff --git 
a/code/tools/stackexchange-converter/readme.md b/code/tools/stackexchange-converter/readme.md deleted file mode 100644 index 2490d045..00000000 --- a/code/tools/stackexchange-converter/readme.md +++ /dev/null @@ -1,24 +0,0 @@ -This tool converts from stackexchange's 7z-compressed XML -format to a sqlite database that is digestible by the search engine. - -See [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) for -an explanation why this is necessary. - -Stackexchange's data dumps can be downloaded from archive.org -here: [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange) - -Usage - -```shell -$ stackexchange-converter domain-name input.7z output.db -``` - -Stackexchange is relatively conservative about allowing -new questions, so this is a job that doesn't run more than once. - -Note: Reading and writing these db files is *absurdly* slow -on a mechanical hard-drive. - -## See Also - -* [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) \ No newline at end of file diff --git a/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java b/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java deleted file mode 100644 index a287bdd2..00000000 --- a/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.tools; - -import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb; - -import java.nio.file.Files; -import java.nio.file.Path; - -public class StackexchangeConverter { - public static void main(String[] args) { - - if (args.length != 3) { - System.err.println("Converts a stackexchange Posts 7z file to a Marginalia-digestible sqlite-db\n"); - System.err.println("Arguments: domain-name input-file.7z output-file.db"); - return; - } - - String domain = args[0]; - - Path inputFile = 
Path.of(args[1]); - Path outputFile = Path.of(args[2]); - - if (!Files.exists(inputFile)) - System.err.println("Input file " + inputFile + " does not exists"); - - System.out.println("Converting " + inputFile); - - StackExchangePostsDb.create(domain, outputFile, inputFile); - - System.out.println("... done!"); - } -} diff --git a/doc/images/convert_2.png b/doc/images/convert_2.png new file mode 100644 index 00000000..c5adb27d Binary files /dev/null and b/doc/images/convert_2.png differ diff --git a/doc/images/load_warc.png b/doc/images/load_warc.png new file mode 100644 index 00000000..5e0cedde Binary files /dev/null and b/doc/images/load_warc.png differ diff --git a/doc/images/sideload_menu.png b/doc/images/sideload_menu.png new file mode 100644 index 00000000..6a85d076 Binary files /dev/null and b/doc/images/sideload_menu.png differ diff --git a/doc/images/sideload_warc.png b/doc/images/sideload_warc.png new file mode 100644 index 00000000..dd763efc Binary files /dev/null and b/doc/images/sideload_warc.png differ diff --git a/doc/sideloading-howto.md b/doc/sideloading-howto.md index 3faf68fa..93a44981 100644 --- a/doc/sideloading-howto.md +++ b/doc/sideloading-howto.md @@ -1,23 +1,121 @@ # Sideloading How-To -(This document is a bit of a draft to get this down in writing -while it's still fresh in my head.) - Some websites are much larger than others, this includes Wikipedia, Stack Overflow, and a few others. They are so large they are impractical to crawl in the traditional fashion, but luckily they make available data dumps that can be processed and loaded into the search engine through other means. -## Notes on Docker +To this end, it's possible to sideload data into the search engine +from other sources than the web crawler. -If you're running the system in docker, you'll need to provide the paths -to the data in a way where it is available to the docker container. 
+## Index Nodes + +In practice, if you want to sideload data, you need to do it on +a separate index node. Index nodes are separate instances of the +index software. The default configuration is to have two index nodes, +one for the web crawler, and one for sideloaded data. + +The need for a separate node is due to incompatibilities in the workflows. + +It is also a good idea in general, as very large domains can easily be so large that the entire time budget +for the query is spent sifting through documents from that one domain. This is +especially true with something like Wikipedia, which has a lot of documents at +least tangentially related to any given topic. + +This how-to assumes that you are operating on index-node 2. + +## Notes on the upload directory + +This is written assuming that the system is installed with the `install.sh` +script, which deploys the system with docker-compose, and has a directory +structure like + +``` +... +index-1/backup/ +index-1/index/ +index-1/storage/ +index-1/uploads/ +index-1/work/ +index-2/backup/ +index-2/index/ +index-2/storage/ +index-2/uploads/ +index-2/work/ +... +``` + +We're going to be putting files in the **uploads** directories. If you have installed +the system in some other way, or changed the configuration significantly, you need +to adjust the paths accordingly. + +## Sideloading + +The sideloading actions are available through the Actions menu in each node. + +![Sideload menu](images/sideload_menu.png) + +## Sideloading WARCs + +WARC files are the standard format for web archives. They can be created e.g. with wget. +The Marginalia software can read WARC files directly, and sideload them into the index, +as long as each WARC file contains only one domain. 
+ +Let's for example archive www.marginalia.nu (I own this domain, so feel free to try this at home) + +```bash +$ wget -r --warc-file=marginalia www.marginalia.nu +``` + +**Note** If you intend to do this on other websites, you should probably add a `--wait` parameter to wget, +e.g. `wget --wait=1 -r --warc-file=...` to avoid hammering the website with requests and getting blocked. + +This will take a moment, and create a file called `marginalia.warc.gz`. We move it to the +upload directory of the index node, and sideload it through the Actions menu. + +```bash +$ mkdir -p index-2/uploads/marginalia-warc +$ mv marginalia.warc.gz index-2/uploads/marginalia-warc +``` + +Go to the Actions menu, and select the "Sideload WARC" action. This will show a list of +subdirectories in the Uploads directory. Select the directory containing the WARC file, and +click "Sideload". + +![Sideload WARC screenshot](images/sideload_warc.png) + +This should take you to the node overview, where you can see the progress of the sideloading. +It will take a moment, as the WARC file is being processed. + +![Processing in progress](images/convert_2.png) + +It will not be loaded automatically. This is to permit you to sideload multiple sources. + +When you are ready to load it, go to the Actions menu, and select "Load Crawl Data". + +![Load Crawl Data](images/load_warc.png) + +Select all the sources you want to load, and click "Load". This will load the data into the +index, and make it available for searching. + +## Sideloading Wikipedia + +Due to licensing incompatibilities with OpenZim's GPL-2 and AGPL, the workflow +depends on using the conversion process from [https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/) +to pre-digest the data. 
+ +Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu) +and follow the instructions for downloading a ZIM file, and then run something like + +```$./encyclopedia convert file.zim articles.db``` + +This db-file can be processed and loaded into the search engine through the +Actions view. + +FIXME: It will currently only point to en.wikipedia.org, this should be +made configurable. -Either mount the data into the executor's container, or copy it into e.g. -the data directory, which is mounted into the container as `/wmsa/data`. -For a test deployment, a file placed in `run/data/foo` will be available -in the container as `/wmsa/data/foo`. ## Sideloading a directory tree @@ -98,23 +196,6 @@ python-3.11.5/[...] This yaml-file can be processed and loaded into the search engine through the Actions view. -## Sideloading Wikipedia - -For now, this workflow depends on using the conversion process from -[https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/) -to pre-digest the data. This is because it uses OpenZIM which has a -license that is incompatible with this project. - -Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu) -and follow the instructions for downloading a ZIM file, and then run something like - -```$./encyclopedia convert file.zim articles.db``` - -This db-file can be processed and loaded into the search engine through the -Actions view. - -FIXME: It will currently only point to en.wikipedia.org, this should be -made configurable. 
## Sideloading Stack Overflow/Stackexchange diff --git a/settings.gradle b/settings.gradle index d91df376..190df710 100644 --- a/settings.gradle +++ b/settings.gradle @@ -85,11 +85,9 @@ include 'code:tools:term-frequency-extractor' include 'code:tools:experiment-runner' include 'code:tools:screenshot-capture-tool' include 'code:tools:load-test' -include 'code:tools:stackexchange-converter' include 'code:tools:crawl-data-unfcker' include 'third-party:porterstemmer' -include 'third-party:xz' include 'third-party:symspell' include 'third-party:rdrpostagger' include 'third-party:openzim' @@ -164,7 +162,7 @@ dependencyResolutionManagement { library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13') library('commons.net', 'commons-net','commons-net').version('3.9.0') library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0') - library('commons.compress','org.apache.commons','commons-compress').version('1.21') + library('commons.compress','org.apache.commons','commons-compress').version('1.25.0') library('commons.io','commons-io','commons-io').version('2.11.0') library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0') @@ -185,6 +183,7 @@ dependencyResolutionManagement { library('zstd','com.github.luben','zstd-jni').version('1.5.2-2') library('lz4','org.lz4','lz4-java').version('1.8.0') + library('xz','org.tukaani','xz').version('1.9') library('flyway.core','org.flywaydb','flyway-core').version('10.4.1') library('flyway.mysql','org.flywaydb','flyway-mysql').version('10.4.1') diff --git a/third-party/encyclopedia-marginalia-nu/build.gradle b/third-party/encyclopedia-marginalia-nu/build.gradle index 443b599d..992f70e2 100644 --- a/third-party/encyclopedia-marginalia-nu/build.gradle +++ b/third-party/encyclopedia-marginalia-nu/build.gradle @@ -17,7 +17,6 @@ dependencies { implementation project(':code:libraries:blocking-thread-pool') - implementation project(':third-party:xz') implementation 
project(':third-party:openzim') } diff --git a/third-party/openzim/build.gradle b/third-party/openzim/build.gradle index 08f91136..2a1c9da0 100644 --- a/third-party/openzim/build.gradle +++ b/third-party/openzim/build.gradle @@ -16,7 +16,7 @@ dependencies { implementation libs.databind implementation libs.bundles.gson - implementation project(':third-party:xz') + implementation libs.xz } test { diff --git a/third-party/xz/build.gradle b/third-party/xz/build.gradle deleted file mode 100644 index b49a1ccd..00000000 --- a/third-party/xz/build.gradle +++ /dev/null @@ -1,16 +0,0 @@ -plugins { - id 'java' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) - } -} - -dependencies { -} - -test { - useJUnitPlatform() -} diff --git a/third-party/xz/readme.md b/third-party/xz/readme.md deleted file mode 100644 index a74b4d52..00000000 --- a/third-party/xz/readme.md +++ /dev/null @@ -1,9 +0,0 @@ -# XZ - -[XZ for Java](https://tukaani.org/xz/) - Public Domain - -"XZ Utils is free general-purpose data compression software with a high compression ratio. -XZ Utils were written for POSIX-like systems, but also work on some not-so-POSIX systems. -XZ Utils are the successor to LZMA Utils." - -Needed for [openzim](../openzim) to deal with modern zim files. \ No newline at end of file diff --git a/third-party/xz/src/main/java/org/tukaani/xz/BlockInputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/BlockInputStream.java deleted file mode 100644 index d015daba..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/BlockInputStream.java +++ /dev/null @@ -1,212 +0,0 @@ -/* - * BlockInputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -import java.io.InputStream; -import java.io.DataInputStream; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.util.Arrays; -import org.tukaani.xz.common.DecoderUtil; -import org.tukaani.xz.check.Check; - -class BlockInputStream extends InputStream { - private final InputStream in; - private final DataInputStream inData; - private final CountingInputStream inCounted; - private InputStream filterChain; - private final Check check; - - private long uncompressedSizeInHeader = -1; - private long compressedSizeInHeader = -1; - private long compressedSizeLimit; - private final int headerSize; - private long uncompressedSize = 0; - - public BlockInputStream(InputStream in, Check check, int memoryLimit) - throws IOException, IndexIndicatorException { - this.in = in; - this.check = check; - inData = new DataInputStream(in); - - byte[] buf = new byte[DecoderUtil.BLOCK_HEADER_SIZE_MAX]; - - // Block Header Size or Index Indicator - inData.readFully(buf, 0, 1); - - // See if this begins the Index field. - if (buf[0] == 0x00) - throw new IndexIndicatorException(); - - // Read the rest of the Block Header. - headerSize = 4 * (buf[0] + 1); - inData.readFully(buf, 1, headerSize - 1); - - // Validate the CRC32. - if (!DecoderUtil.isCRC32Valid(buf, 0, headerSize - 4, headerSize - 4)) - throw new CorruptedInputException("XZ Block Header is corrupt"); - - // Check for reserved bits in Block Flags. - if ((buf[1] & 0x3C) != 0) - throw new UnsupportedOptionsException( - "Unsupported options in XZ Block Header"); - - // Memory for the Filter Flags field - int filterCount = (buf[1] & 0x03) + 1; - long[] filterIDs = new long[filterCount]; - byte[][] filterProps = new byte[filterCount][]; - - // Use a stream to parse the fields after the Block Flags field. - // Exclude the CRC32 field at the end. 
- ByteArrayInputStream bufStream = new ByteArrayInputStream( - buf, 2, headerSize - 6); - - try { - // Set the maximum valid compressed size. This is overriden - // by the value from the Compressed Size field if it is present. - compressedSizeLimit = (DecoderUtil.VLI_MAX & ~3) - - headerSize - check.getSize(); - - // Decode and validate Compressed Size if the relevant flag - // is set in Block Flags. - if ((buf[1] & 0x40) != 0x00) { - compressedSizeInHeader = DecoderUtil.decodeVLI(bufStream); - - if (compressedSizeInHeader == 0 - || compressedSizeInHeader > compressedSizeLimit) - throw new CorruptedInputException(); - - compressedSizeLimit = compressedSizeInHeader; - } - - // Decode Uncompressed Size if the relevant flag is set - // in Block Flags. - if ((buf[1] & 0x80) != 0x00) - uncompressedSizeInHeader = DecoderUtil.decodeVLI(bufStream); - - // Decode Filter Flags. - for (int i = 0; i < filterCount; ++i) { - filterIDs[i] = DecoderUtil.decodeVLI(bufStream); - - long filterPropsSize = DecoderUtil.decodeVLI(bufStream); - if (filterPropsSize > bufStream.available()) - throw new CorruptedInputException(); - - filterProps[i] = new byte[(int)filterPropsSize]; - bufStream.read(filterProps[i]); - } - - } catch (IOException e) { - throw new CorruptedInputException("XZ Block Header is corrupt"); - } - - // Check that the remaining bytes are zero. - for (int i = bufStream.available(); i > 0; --i) - if (bufStream.read() != 0x00) - throw new UnsupportedOptionsException( - "Unsupported options in XZ Block Header"); - - // Check if the Filter IDs are supported, decode - // the Filter Properties, and check that they are - // supported by this decoder implementation. 
- FilterDecoder[] filters = new FilterDecoder[filterIDs.length]; - - for (int i = 0; i < filters.length; ++i) { - if (filterIDs[i] == LZMA2Coder.FILTER_ID) - filters[i] = new LZMA2Decoder(filterProps[i]); - - else if (filterIDs[i] == DeltaCoder.FILTER_ID) - filters[i] = new DeltaDecoder(filterProps[i]); - - else - throw new UnsupportedOptionsException( - "Unknown Filter ID " + filterIDs[i]); - } - - RawCoder.validate(filters); - - // Check the memory usage limit. - if (memoryLimit >= 0) { - int memoryNeeded = 0; - for (int i = 0; i < filters.length; ++i) - memoryNeeded += filters[i].getMemoryUsage(); - - if (memoryNeeded > memoryLimit) - throw new MemoryLimitException(memoryNeeded, memoryLimit); - } - - // Use an input size counter to calculate - // the size of the Compressed Data field. - inCounted = new CountingInputStream(in); - - // Initialize the filter chain. - filterChain = inCounted; - for (int i = filters.length - 1; i >= 0; --i) - filterChain = filters[i].getInputStream(filterChain); - } - - public int read() throws IOException { - byte[] buf = new byte[1]; - return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); - } - - public int read(byte[] buf, int off, int len) throws IOException { - int ret = filterChain.read(buf, off, len); - long compressedSize = inCounted.getSize(); - - if (ret > 0) { - check.update(buf, off, ret); - uncompressedSize += ret; - - // Catch invalid values. - if (compressedSize < 0 - || compressedSize > compressedSizeLimit - || uncompressedSize < 0 - || (uncompressedSizeInHeader != -1 - && uncompressedSize > uncompressedSizeInHeader)) - throw new CorruptedInputException(); - - } else if (ret == -1) { - // Validate Compressed Size and Uncompressed Size if they were - // present in Block Header. 
- if ((compressedSizeInHeader != -1 - && compressedSizeInHeader != compressedSize) - || (uncompressedSizeInHeader != -1 - && uncompressedSizeInHeader != uncompressedSize)) - throw new CorruptedInputException(); - - // Block Padding bytes must be zeros. - for (long i = compressedSize; (i & 3) != 0; ++i) - if (inData.readUnsignedByte() != 0x00) - throw new CorruptedInputException(); - - // Validate the integrity check. - byte[] storedCheck = new byte[check.getSize()]; - inData.readFully(storedCheck); - if (!Arrays.equals(check.finish(), storedCheck)) - throw new CorruptedInputException("Integrity (" - + check.getName() + ") check does not match"); - } - - return ret; - } - - public int available() throws IOException { - return filterChain.available(); - } - - public long getUnpaddedSize() { - return headerSize + inCounted.getSize() + check.getSize(); - } - - public long getUncompressedSize() { - return uncompressedSize; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/BlockOutputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/BlockOutputStream.java deleted file mode 100644 index b031116d..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/BlockOutputStream.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * BlockOutputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -import java.io.OutputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import org.tukaani.xz.common.EncoderUtil; -import org.tukaani.xz.check.Check; - -class BlockOutputStream extends FinishableOutputStream { - private final OutputStream out; - private final CountingOutputStream outCounted; - private FinishableOutputStream filterChain; - private final Check check; - - private final int headerSize; - private final long compressedSizeLimit; - private long uncompressedSize = 0; - - public BlockOutputStream(OutputStream out, FilterEncoder[] filters, - Check check) throws IOException { - this.out = out; - this.check = check; - - // Initialize the filter chain. - outCounted = new CountingOutputStream(out); - filterChain = outCounted; - for (int i = 0; i < filters.length; ++i) - filterChain = filters[i].getOutputStream(filterChain); - - // Prepare to encode the Block Header field. - ByteArrayOutputStream bufStream = new ByteArrayOutputStream(); - - // Write a dummy Block Header Size field. The real value is written - // once everything else except CRC32 has been written. - bufStream.write(0x00); - - // Write Block Flags. Storing Compressed Size or Uncompressed Size - // isn't supported for now. - bufStream.write(filters.length - 1); - - // List of Filter Flags - for (int i = 0; i < filters.length; ++i) { - EncoderUtil.encodeVLI(bufStream, filters[i].getFilterID()); - byte[] filterProps = filters[i].getFilterProps(); - EncoderUtil.encodeVLI(bufStream, filterProps.length); - bufStream.write(filterProps); - } - - // Header Padding - while ((bufStream.size() & 3) != 0) - bufStream.write(0x00); - - byte[] buf = bufStream.toByteArray(); - - // Total size of the Block Header: Take the size of the CRC32 field - // into account. - headerSize = buf.length + 4; - - // This is just a sanity check. 
- if (headerSize > EncoderUtil.BLOCK_HEADER_SIZE_MAX) - throw new UnsupportedOptionsException(); - - // Block Header Size - buf[0] = (byte)(buf.length / 4); - - // Write the Block Header field to the output stream. - out.write(buf); - EncoderUtil.writeCRC32(out, buf); - - // Calculate the maximum allowed size of the Compressed Data field. - // It is hard to exceed it so this is mostly to be pedantic. - compressedSizeLimit = (EncoderUtil.VLI_MAX & ~3) - - headerSize - check.getSize(); - } - - public void write(int b) throws IOException { - byte[] buf = new byte[1]; - buf[0] = (byte)b; - write(buf, 0, 1); - } - - public void write(byte[] buf, int off, int len) throws IOException { - filterChain.write(buf, off, len); - check.update(buf, off, len); - uncompressedSize += len; - validate(); - } - - public void finish() throws IOException { - // Finish the Compressed Data field. - filterChain.finish(); - validate(); - - // Block Padding - for (long i = outCounted.getSize(); (i & 3) != 0; ++i) - out.write(0x00); - - // Check - out.write(check.finish()); - } - - private void validate() throws IOException { - long compressedSize = outCounted.getSize(); - - // It is very hard to trigger this exception. - // This is just to be pedantic. 
- if (compressedSize < 0 || compressedSize > compressedSizeLimit - || uncompressedSize < 0) - throw new XZIOException("XZ Stream has grown too big"); - } - - public long getUnpaddedSize() { - return headerSize + outCounted.getSize() + check.getSize(); - } - - public long getUncompressedSize() { - return uncompressedSize; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/CorruptedInputException.java b/third-party/xz/src/main/java/org/tukaani/xz/CorruptedInputException.java deleted file mode 100644 index d7d95207..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/CorruptedInputException.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * CorruptedInputException - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -/** - * Thrown when the compressed input data is corrupt. - * However, it is possible that some or all of the data - * already read from the input stream was corrupt too. - */ -public class CorruptedInputException extends XZIOException { - private static final long serialVersionUID = 3L; - - /** - * Creates a new CorruptedInputException with - * the default error detail message. - */ - public CorruptedInputException() { - super("Compressed data is corrupt"); - } - - /** - * Creates a new CorruptedInputException with - * the specified error detail message. - * - * @param s error detail message - */ - public CorruptedInputException(String s) { - super(s); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/CountingInputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/CountingInputStream.java deleted file mode 100644 index 2d85eaf6..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/CountingInputStream.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * CountingInputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -import java.io.FilterInputStream; -import java.io.InputStream; -import java.io.IOException; - -class CountingInputStream extends FilterInputStream { - private long size = 0; - - public CountingInputStream(InputStream in) { - super(in); - } - - public int read() throws IOException { - int ret = in.read(); - if (ret != -1 && size >= 0) - ++size; - - return ret; - } - - public int read(byte[] b, int off, int len) throws IOException { - int ret = in.read(b, off, len); - if (ret > 0 && size >= 0) - size += ret; - - return ret; - } - - public long getSize() { - return size; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/CountingOutputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/CountingOutputStream.java deleted file mode 100644 index 8a1c1907..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/CountingOutputStream.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * CountingOutputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -import java.io.OutputStream; -import java.io.IOException; - -class CountingOutputStream extends FinishableOutputStream { - private final OutputStream out; - private long size = 0; - - public CountingOutputStream(OutputStream out) { - this.out = out; - } - - public void write(int b) throws IOException { - out.write(b); - if (size >= 0) - ++size; - } - - public void write(byte[] b, int off, int len) throws IOException { - out.write(b, off, len); - if (size >= 0) - size += len; - } - - public void flush() throws IOException { - out.flush(); - } - - public void close() throws IOException { - out.close(); - } - - public long getSize() { - return size; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/DeltaCoder.java b/third-party/xz/src/main/java/org/tukaani/xz/DeltaCoder.java deleted file mode 100644 index 808834c8..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/DeltaCoder.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * DeltaCoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -abstract class DeltaCoder implements FilterCoder { - public static final long FILTER_ID = 0x03; - - public boolean changesSize() { - return false; - } - - public boolean nonLastOK() { - return true; - } - - public boolean lastOK() { - return false; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/DeltaDecoder.java b/third-party/xz/src/main/java/org/tukaani/xz/DeltaDecoder.java deleted file mode 100644 index 445d1782..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/DeltaDecoder.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * DeltaDecoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -import java.io.InputStream; - -class DeltaDecoder extends DeltaCoder implements FilterDecoder { - private final int distance; - - DeltaDecoder(byte[] props) throws UnsupportedOptionsException { - if (props.length != 1) - throw new UnsupportedOptionsException( - "Unsupported Delta filter properties"); - - distance = (props[0] & 0xFF) + 1; - } - - public int getMemoryUsage() { - return 1; - } - - public InputStream getInputStream(InputStream in) { - return new DeltaInputStream(in, distance); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/DeltaInputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/DeltaInputStream.java deleted file mode 100644 index 876c7033..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/DeltaInputStream.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * DeltaInputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.InputStream; -import java.io.IOException; -import org.tukaani.xz.delta.DeltaDecoder; - -/** - * Decodes Delta-filtered data. - *

- * The delta filter doesn't change the size of the data and thus it - * cannot have an end-of-payload marker. It will simply decode until - * its input stream indicates end of input. - */ -public class DeltaInputStream extends InputStream { - /** - * Smallest supported delta calculation distance. - */ - public static final int DISTANCE_MIN = 1; - - /** - * Largest supported delta calculation distance. - */ - public static final int DISTANCE_MAX = 256; - - private final InputStream in; - private final DeltaDecoder delta; - - /** - * Creates a new Delta decoder with the given delta calculation distance. - * - * @param in input stream from which Delta filtered data - * is read - * - * @param distance delta calculation distance, must be in the - * range [DISTANCE_MIN, - * DISTANCE_MAX] - */ - public DeltaInputStream(InputStream in, int distance) { - this.in = in; - this.delta = new DeltaDecoder(distance); - } - - /** - * Decode the next byte from this input stream. - * - * @return the next decoded byte, or -1 to indicate - * the end of input on the input stream in - * - * @throws IOException may be thrown by in - */ - public int read() throws IOException { - byte[] buf = new byte[1]; - return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); - } - - /** - * Decode into an array of bytes. - *

- * This calls in.read(buf, off, len) and defilters the - * returned data. - * - * @param buf target buffer for decoded data - * @param off start offset in buf - * @param len maximum number of bytes to read - * - * @return number of bytes read, or -1 to indicate - * the end of the input stream in - * - * @throws IOException may be thrown by underlaying input - * stream in - */ - public int read(byte[] buf, int off, int len) throws IOException { - int size = in.read(buf, off, len); - if (size == -1) - return -1; - - delta.decode(buf, off, size); - return size; - } - - /** - * Calls in.available(). - * - * @return the value returned by in.available() - */ - public int available() throws IOException { - return in.available(); - } - - /** - * Calls in.close(). - */ - public void close() throws IOException { - in.close(); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/FilterCoder.java b/third-party/xz/src/main/java/org/tukaani/xz/FilterCoder.java deleted file mode 100644 index 1e95e37f..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/FilterCoder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * FilterCoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -interface FilterCoder { - boolean changesSize(); - boolean nonLastOK(); - boolean lastOK(); -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/FilterDecoder.java b/third-party/xz/src/main/java/org/tukaani/xz/FilterDecoder.java deleted file mode 100644 index 8e2d0061..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/FilterDecoder.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * FilterDecoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -import java.io.InputStream; - -interface FilterDecoder extends FilterCoder { - int getMemoryUsage(); - InputStream getInputStream(InputStream in); -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/FilterEncoder.java b/third-party/xz/src/main/java/org/tukaani/xz/FilterEncoder.java deleted file mode 100644 index 2b2c2a51..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/FilterEncoder.java +++ /dev/null @@ -1,16 +0,0 @@ -/* - * FilterEncoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -interface FilterEncoder extends FilterCoder { - long getFilterID(); - byte[] getFilterProps(); - FinishableOutputStream getOutputStream(FinishableOutputStream out); -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/FilterOptions.java b/third-party/xz/src/main/java/org/tukaani/xz/FilterOptions.java deleted file mode 100644 index ec2a6894..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/FilterOptions.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * FilterOptions - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -import java.io.InputStream; -import java.io.IOException; - -public abstract class FilterOptions implements Cloneable { - public abstract int getEncoderMemoryUsage(); - public abstract FinishableOutputStream getOutputStream( - FinishableOutputStream out); - - public abstract int getDecoderMemoryUsage(); - public abstract InputStream getInputStream(InputStream in) - ; - - abstract FilterEncoder getFilterEncoder(); - - FilterOptions() {} -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/FinishableOutputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/FinishableOutputStream.java deleted file mode 100644 index 64d4ca53..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/FinishableOutputStream.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * FinishableOutputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.OutputStream; -import java.io.IOException; - -/** - * Output stream that supports finishing without closing - * the underlying stream. - */ -public abstract class FinishableOutputStream extends OutputStream { - /** - * Finish the stream without closing the underlying stream. - * No more data may be written to the stream after finishing. - *

- * The finish method of FinishableOutputStream - * does nothing. Subclasses should override it if they need finishing - * support, which is the case, for example, with compressors. - * - * @throws IOException - */ - public void finish() throws IOException {} -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/IndexIndicatorException.java b/third-party/xz/src/main/java/org/tukaani/xz/IndexIndicatorException.java deleted file mode 100644 index fc6bc038..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/IndexIndicatorException.java +++ /dev/null @@ -1,14 +0,0 @@ -/* - * IndexIndicatorException - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -class IndexIndicatorException extends Exception { - private static final long serialVersionUID = 1L; -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Coder.java b/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Coder.java deleted file mode 100644 index b0963b75..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Coder.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * LZMA2Coder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -abstract class LZMA2Coder implements FilterCoder { - public static final long FILTER_ID = 0x21; - - public boolean changesSize() { - return true; - } - - public boolean nonLastOK() { - return false; - } - - public boolean lastOK() { - return true; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Decoder.java b/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Decoder.java deleted file mode 100644 index 82075c21..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Decoder.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * LZMA2Decoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.InputStream; - -class LZMA2Decoder extends LZMA2Coder implements FilterDecoder { - private int dictSize; - - LZMA2Decoder(byte[] props) throws UnsupportedOptionsException { - // Up to 1.5 GiB dictionary is supported. The bigger ones - // are too big for int. - if (props.length != 1 || (props[0] & 0xFF) > 37) - throw new UnsupportedOptionsException( - "Unsupported LZMA2 properties"); - - dictSize = 2 | (props[0] & 1); - dictSize <<= (props[0] >>> 1) + 11; - } - - public int getMemoryUsage() { - return LZMA2InputStream.getMemoryUsage(dictSize); - } - - public InputStream getInputStream(InputStream in) { - return new LZMA2InputStream(in, dictSize); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Encoder.java b/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Encoder.java deleted file mode 100644 index 97f4532d..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Encoder.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * LZMA2Encoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -class LZMA2Encoder extends LZMA2Coder implements FilterEncoder { - private final LZMA2Options options; - private final byte[] props = new byte[1]; - - LZMA2Encoder(LZMA2Options options) { - // Make a private copy so that the caller is free to change its copy. - this.options = (LZMA2Options)options.clone(); - - // TODO: Props!!! - - } - - public long getFilterID() { - return FILTER_ID; - } - - public byte[] getFilterProps() { - return props; - } - - public FinishableOutputStream getOutputStream(FinishableOutputStream out) { - return options.getOutputStream(out); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2InputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/LZMA2InputStream.java deleted file mode 100644 index 36a17f13..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2InputStream.java +++ /dev/null @@ -1,328 +0,0 @@ -/* - * LZMA2InputStream - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.InputStream; -import java.io.DataInputStream; -import java.io.IOException; -import org.tukaani.xz.lz.LZDecoder; -import org.tukaani.xz.rangecoder.RangeDecoder; -import org.tukaani.xz.lzma.LZMADecoder; - -/** - * Decompresses a raw LZMA2 stream. - */ -public class LZMA2InputStream extends InputStream { - /** - * Smallest valid LZMA2 dictionary size. - *

- * Very tiny dictionaries would be a performance problem, so - * the minimum is 4 KiB. - */ - public static final int DICT_SIZE_MIN = 4096; - - /** - * Largest dictionary size supported by this implementation. - *

- * The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB. - * This implementation supports only 16 bytes less than 2 GiB for raw - * LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This - * limitation is due to Java using signed 32-bit integers for array - * indexing. The limitation shouldn't matter much in practice since so - * huge dictionaries are not normally used. - */ - public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15; - - private static final int COMPRESSED_SIZE_MAX = 1 << 16; - - private final DataInputStream in; - - private final LZDecoder lz; - private final RangeDecoder rc = new RangeDecoder(COMPRESSED_SIZE_MAX); - private LZMADecoder lzma; - - private int uncompressedSize = 0; - private boolean isLZMAChunk; - - private boolean needDictReset = true; - private boolean needProps = true; - private boolean endReached = false; - - private IOException exception = null; - - /** - * Gets approximate decompressor memory requirements as kibibytes for - * the given dictionary size. - * - * @param dictSize LZMA2 dictionary size as bytes, must be - * in the range [DICT_SIZE_MIN, - * DICT_SIZE_MAX] - * - * @return approximate memory requirements as kibibytes (KiB) - */ - public static int getMemoryUsage(int dictSize) { - // The base state is aroudn 30-40 KiB (probabilities etc.), - // range decoder needs COMPRESSED_SIZE_MAX bytes for buffering, - // and LZ decoder needs a dictionary buffer. - return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024; - } - - private static int getDictSize(int dictSize) { - if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX) - throw new IllegalArgumentException( - "Unsupported dictionary size " + dictSize); - - // Round dictionary size upward to a multiple of 16. This way LZMA - // can use LZDecoder.getPos() for calculating LZMA's posMask. - // Note that this check is needed only for raw LZMA2 streams; it is - // redundant with .xz. 
- return (dictSize + 15) & ~15; - } - - /** - * Creates a new input stream that decompresses raw LZMA2 data - * from in. - *

- * The caller needs to know the dictionary size used when compressing; - * the dictionary size isn't stored as part of a raw LZMA2 stream. - *

- * Specifying a too small dictionary size will prevent decompressing - * the stream. Specifying a too big dictionary is waste of memory but - * decompression will work. - *

- * There is no need to specify a dictionary bigger than - * the uncompressed size of the data even if a bigger dictionary - * was used when compressing. If you know the uncompressed size - * of the data, this might allow saving some memory. - * - * @param in input stream from which LZMA2-compressed - * data is read - * - * @param dictSize LZMA2 dictionary size as bytes, must be - * in the range [DICT_SIZE_MIN, - * DICT_SIZE_MAX] - */ - public LZMA2InputStream(InputStream in, int dictSize) { - this.in = new DataInputStream(in); - this.lz = new LZDecoder(getDictSize(dictSize), null); - } - - /** - * Creates a new LZMA2 decompressor using a preset dictionary. - *

- * This is like LZMAInputStream() except that the - * dictionary may be initialized using a preset dictionary. - * If a preset dictionary was used when compressing the data, the - * same preset dictionary must be provided when decompressing. - * - * @param in input stream from which LZMA2-compressed - * data is read - * - * @param dictSize LZMA2 dictionary size as bytes, must be - * in the range [DICT_SIZE_MIN, - * DICT_SIZE_MAX] - * - * @param presetDict preset dictionary or null - * to use no preset dictionary - */ - public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) { - this.in = new DataInputStream(in); - this.lz = new LZDecoder(getDictSize(dictSize), presetDict); - - if (presetDict.length > 0) - needDictReset = false; - } - - /** - * Decompresses the next byte from this input stream. - *

- * Reading lots of data with read() from this input stream - * may be inefficient. Wrap it in java.io.BufferedInputStream - * if you need to read lots of data one byte at a time. - * - * @return the next decompressed byte, or -1 - * to indicate the end of the compressed stream - * - * @throws CorruptedInputException - * - * @throws EOFException - * compressed input is truncated or corrupt - * - * @throws IOException may be thrown by in - */ - public int read() throws IOException { - byte[] buf = new byte[1]; - return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); - } - - /** - * Decompresses into an array of bytes. - *

- * If len is zero, no bytes are read and 0 - * is returned. Otherwise this will block until len - * bytes have been decompressed, the end of LZMA2 stream is reached, - * or an exception is thrown. - * - * @param buf target buffer for uncompressed data - * @param off start offset in buf - * @param len maximum number of uncompressed bytes to read - * - * @return number of bytes read, or -1 to indicate - * the end of the compressed stream - * - * @throws CorruptedInputException - * - * @throws EOFException - * compressed input is truncated or corrupt - * - * @throws IOException may be thrown by in - */ - public int read(byte[] buf, int off, int len) throws IOException { - if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) - throw new IllegalArgumentException(); - - if (len == 0) - return 0; - - if (exception != null) - throw exception; - - if (endReached) - return -1; - - try { - int size = 0; - - while (len > 0) { - if (uncompressedSize == 0) { - decodeChunkHeader(); - if (endReached) - return size == 0 ? 
-1 : size; - } - - int copySizeMax = Math.min(uncompressedSize, len); - - if (!isLZMAChunk) { - lz.copyUncompressed(in, copySizeMax); - } else { - lz.setLimit(copySizeMax); - lzma.decode(); - } - - int copiedSize = lz.flush(buf, off); - off += copiedSize; - len -= copiedSize; - size += copiedSize; - uncompressedSize -= copiedSize; - - if (uncompressedSize == 0) - if (!rc.isFinished() || lz.hasPending()) - throw new CorruptedInputException(); - } - - return size; - - } catch (IOException e) { - exception = e; - throw e; - } - } - - private void decodeChunkHeader() throws IOException { - int control = in.readUnsignedByte(); - - if (control == 0x00) { - endReached = true; - return; - } - - if (control >= 0xE0 || control == 0x01) { - needProps = true; - needDictReset = false; - lz.reset(); - } else if (needDictReset) { - throw new CorruptedInputException(); - } - - if (control >= 0x80) { - isLZMAChunk = true; - - uncompressedSize = (control & 0x1F) << 16; - uncompressedSize += in.readUnsignedShort() + 1; - - int compressedSize = in.readUnsignedShort() + 1; - - if (control >= 0xC0) { - needProps = false; - decodeProps(); - - } else if (needProps) { - throw new CorruptedInputException(); - - } else if (control >= 0xA0) { - lzma.reset(); - } - - rc.prepareInputBuffer(in, compressedSize); - - } else if (control > 0x02) { - throw new CorruptedInputException(); - - } else { - isLZMAChunk = false; - uncompressedSize = in.readUnsignedShort() + 1; - } - } - - private void decodeProps() throws IOException { - int props = in.readUnsignedByte(); - - if (props > (4 * 5 + 4) * 9 + 8) - throw new CorruptedInputException(); - - int pb = props / (9 * 5); - props -= pb * 9 * 5; - int lp = props / 9; - int lc = props - lp * 9; - - if (lc + lp > 4) - throw new CorruptedInputException(); - - lzma = new LZMADecoder(lz, rc, lc, lp, pb); - } - - /** - * Returns the number of uncompressed bytes that can be read - * without blocking. 
The value is returned with an assumption - * that the compressed input data will be valid. If the compressed - * data is corrupt, CorruptedInputException may get - * thrown before the number of bytes claimed to be available have - * been read from this input stream. - *

- * In LZMAInputStream, the return value will be non-zero when the - * decompressor is in the middle of an LZMA2 chunk. The return value - * will then be the number of uncompressed bytes remaining from that - * chunk. - * - * @return the number of uncompressed bytes that can be read - * without blocking - */ - public int available() { - return uncompressedSize; - } - - /** - * Calls in.close(). - */ - public void close() throws IOException { - in.close(); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Options.java b/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Options.java deleted file mode 100644 index 438821d9..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2Options.java +++ /dev/null @@ -1,139 +0,0 @@ -/* - * LZMA2Options - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.InputStream; -import java.io.IOException; - -/** - * Options for LZMA2. - *

- * FIXME: This is unfinished and things might change. - */ -public class LZMA2Options extends FilterOptions { - /** - * Default compression preset. - */ - public static final int PRESET_DEFAULT = 6; - - /** - * Minimum dictionary size. - */ - public static final int DICT_SIZE_MIN = 4096; - - /** - * Maximum dictionary size for compression. - *

- * FIXME? Decompression dictionary size can be bigger. - */ - public static final int DICT_SIZE_MAX = 128 << 20; - - /** - * Maximum value for lc + lp. - */ - public static final int LC_LP_MAX = 4; - - /** - * Maximum value for pb. - */ - public static final int PB_MAX = 4; - - /** - * Compression mode: uncompressed. - * The data is wrapped into a LZMA2 stream without compression. - */ - public static final int MODE_UNCOMPRESSED = 0; - - /** - * Compression mode: fast. - * This is usually combined with a hash chain match finder. - */ - public static final int MODE_FAST = 1; - - /** - * Compression mode: normal. - * This is usually combined with a binary tree match finder. - */ - public static final int MODE_NORMAL = 2; - - /** - * Minimum value for niceLen. - */ - public static final int NICE_LEN_MIN = 8; - - /** - * Maximum value for niceLen. - */ - public static final int NICE_LEN_MAX = 273; - - /** - * Match finder: Hash Chain 2-3-4 - */ - public static final int MF_HC4 = 0x04; - - /** - * Match finder: Binary tree 2-3-4 - */ - public static final int MF_BT4 = 0x14; - - private int dictSize; - -/* - public int lc; - public int lp; - public int pb; - public int mode; - public int niceLen; - public int mf; - public int depth; -*/ - - public LZMA2Options() { - setPreset(PRESET_DEFAULT); - } - - public LZMA2Options(int preset) { - setPreset(preset); - } - - public void setPreset(int preset) { - // TODO - dictSize = 8 << 20; - } - - public int getEncoderMemoryUsage() { - return LZMA2OutputStream.getMemoryUsage(this); - } - - public FinishableOutputStream getOutputStream(FinishableOutputStream out) { - return new LZMA2OutputStream(out, this); - } - - public int getDecoderMemoryUsage() { - return LZMA2InputStream.getMemoryUsage(dictSize); - } - - public InputStream getInputStream(InputStream in) { - return new LZMA2InputStream(in, dictSize); - } - - FilterEncoder getFilterEncoder() { - return new LZMA2Encoder(this); - } - - public Object clone() { - try { - return 
super.clone(); - } catch (CloneNotSupportedException e) { - // Never reached - throw new RuntimeException(); - } - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2OutputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/LZMA2OutputStream.java deleted file mode 100644 index 156af2d7..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/LZMA2OutputStream.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * LZMA2OutputStream - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.IOException; - -// -// TODO: This creates a valid LZMA2 stream but it doesn't compress. -// So this is useless except for testing the .xz container support. -// - -class LZMA2OutputStream extends FinishableOutputStream { - private final FinishableOutputStream out; - - static int getMemoryUsage(LZMA2Options options) { - // TODO - return 1; - } - - LZMA2OutputStream(FinishableOutputStream out, LZMA2Options options) { - this.out = out; - } - - public void write(int b) throws IOException { - byte[] buf = new byte[1]; - buf[0] = (byte)b; - write(buf, 0, 1); - } - - public void write(byte[] buf, int off, int len) throws IOException { - if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) - throw new IllegalArgumentException(); - - while (off > 0x10000) { - writeChunk(buf, off, 0x10000); - off += 0x10000; - len -= 0x10000; - } - - writeChunk(buf, off, len); - } - - private void writeChunk(byte[] buf, int off, int len) throws IOException { - out.write(0x01); - out.write((len - 1) >>> 8); - out.write(len - 1); - out.write(buf, off, len); - } - - private void writeEndMarker() throws IOException { - // TODO: Flush incomplete chunk. 
- out.write(0x00); - } - - public void flush() throws IOException { - throw new UnsupportedOptionsException( - "Flushing LZMA2OutputStream not implemented yet"); - } - - public void finish() throws IOException { - writeEndMarker(); - out.finish(); - } - - public void close() throws IOException { - writeEndMarker(); - out.close(); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/MemoryLimitException.java b/third-party/xz/src/main/java/org/tukaani/xz/MemoryLimitException.java deleted file mode 100644 index 9d766bd7..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/MemoryLimitException.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * MemoryLimitException - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -/** - * Thrown when the memory usage limit given to the XZ decompressor - * would be exceeded. - *

- * The amount of memory required and the memory usage limit are - * included in the error detail message in human readable format. - */ -public class MemoryLimitException extends XZIOException { - private static final long serialVersionUID = 3L; - - private final int memoryNeeded; - private final int memoryLimit; - - /** - * Creates a new MemoryLimitException. - *

- * The amount of memory needed and the memory usage limit are - * included in the error detail message. - * - * @param memoryNeeded amount of memory needed as kibibytes (KiB) - * @param memoryLimit specified memory usage limit as kibibytes (KiB) - */ - public MemoryLimitException(int memoryNeeded, int memoryLimit) { - super("" + memoryNeeded + " KiB of memory would be needed; limit was " - + memoryLimit + " KiB"); - - this.memoryNeeded = memoryNeeded; - this.memoryLimit = memoryLimit; - } - - /** - * Gets how much memory is required to decompress the data. - * - * @return amount of memory needed as kibibytes (KiB) - */ - public int getMemoryNeeded() { - return memoryNeeded; - } - - /** - * Gets what the memory usage limit was at the time the exception - * was created. - * - * @return memory usage limit as kibibytes (KiB) - */ - public int getMemoryLimit() { - return memoryLimit; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/RawCoder.java b/third-party/xz/src/main/java/org/tukaani/xz/RawCoder.java deleted file mode 100644 index 12c7da8f..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/RawCoder.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * RawCoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz; - -class RawCoder { - static void validate(FilterCoder[] filters) - throws UnsupportedOptionsException { - for (int i = 0; i < filters.length - 1; ++i) - if (!filters[i].nonLastOK()) - throw new UnsupportedOptionsException( - "Unsupported XZ filter chain"); - - if (!filters[filters.length - 1].lastOK()) - throw new UnsupportedOptionsException( - "Unsupported XZ filter chain"); - - int changesSizeCount = 0; - for (int i = 0; i < filters.length; ++i) - if (filters[i].changesSize()) - ++changesSizeCount; - - if (changesSizeCount > 3) - throw new UnsupportedOptionsException( - "Unsupported XZ filter chain"); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/SingleXZInputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/SingleXZInputStream.java deleted file mode 100644 index 21272ea0..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/SingleXZInputStream.java +++ /dev/null @@ -1,285 +0,0 @@ -/* - * SingleXZInputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.InputStream; -import java.io.DataInputStream; -import java.io.IOException; -import java.io.EOFException; -import org.tukaani.xz.common.DecoderUtil; -import org.tukaani.xz.common.StreamFlags; -import org.tukaani.xz.index.IndexHash; -import org.tukaani.xz.check.Check; - -/** - * Decompresses exactly one XZ Stream in streamed mode (no seeking). - * The decompression stops after the first XZ Stream has been decompressed, - * and the read position in the input stream is left at the first byte - * after the end of the XZ Stream. This can be useful when XZ data has - * been stored inside some other file format or protocol. - *

- * Unless you know what you are doing, don't use this class to decompress - * standalone .xz files. For that purpose, use XZInputStream. - * - * @see XZInputStream - */ -public class SingleXZInputStream extends InputStream { - private InputStream in; - private int memoryLimit; - private StreamFlags streamHeaderFlags; - private Check check; - private BlockInputStream blockDecoder = null; - private final IndexHash indexHash = new IndexHash(); - private boolean endReached = false; - private IOException exception = null; - - /** - * Creates a new input stream that decompresses exactly one XZ Stream - * from in. - *

- * This constructor reads and parses the XZ Stream Header (12 bytes) - * from in. The header of the first Block is not read - * until read is called. - * - * @param in input stream from which XZ-compressed - * data is read - * - * @throws XZFormatException - * input is not in the XZ format - * - * @throws CorruptedInputException - * XZ header CRC32 doesn't match - * - * @throws UnsupportedOptionsException - * XZ header is valid but specifies options - * not supported by this implementation - * - * @throws EOFException - * less than 12 bytes of input was available - * from in - * - * @throws IOException may be thrown by in - */ - public SingleXZInputStream(InputStream in) throws IOException { - initialize(in, -1); - } - - /** - * Creates a new single-stream XZ decompressor with optional - * memory usage limit. - *

- * This is identical to SingleXZInputStream(InputStream) - * except that this takes also the memoryLimit argument. - * - * @param in input stream from which XZ-compressed - * data is read - * - * @param memoryLimit memory usage limit as kibibytes (KiB) - * or -1 to impose no memory usage limit - * - * @throws XZFormatException - * input is not in the XZ format - * - * @throws CorruptedInputException - * XZ header CRC32 doesn't match - * - * @throws UnsupportedOptionsException - * XZ header is valid but specifies options - * not supported by this implementation - * - * @throws EOFException - * less than 12 bytes of input was available - * from in - * - * @throws IOException may be thrown by in - */ - public SingleXZInputStream(InputStream in, int memoryLimit) - throws IOException { - initialize(in, memoryLimit); - } - - SingleXZInputStream(InputStream in, int memoryLimit, - byte[] streamHeader) throws IOException { - initialize(in, memoryLimit, streamHeader); - } - - private void initialize(InputStream in, int memoryLimit) - throws IOException { - byte[] streamHeader = new byte[DecoderUtil.STREAM_HEADER_SIZE]; - new DataInputStream(in).readFully(streamHeader); - initialize(in, memoryLimit, streamHeader); - } - - private void initialize(InputStream in, int memoryLimit, - byte[] streamHeader) throws IOException { - this.in = in; - this.memoryLimit = memoryLimit; - streamHeaderFlags = DecoderUtil.decodeStreamHeader(streamHeader); - check = Check.getInstance(streamHeaderFlags.checkType); - } - - /** - * Gets the ID of the integrity check used in this XZ Stream. - * - * @return the Check ID specified in the XZ Stream Header - */ - public int getCheckType() { - return streamHeaderFlags.checkType; - } - - /** - * Gets the name of the integrity check used in this XZ Stream. - * - * @return the name of the check specified in the XZ Stream Header - */ - public String getCheckName() { - return check.getName(); - } - - /** - * Decompresses the next byte from this input stream. 
- *

- * Reading lots of data with read() from this input stream - * may be inefficient. Wrap it in java.io.BufferedInputStream - * if you need to read lots of data one byte at a time. - * - * @return the next decompressed byte, or -1 - * to indicate the end of the compressed stream - * - * @throws CorruptedInputException - * @throws UnsupportedOptionsException - * @throws MemoryLimitException - * - * @throws EOFException - * compressed input is truncated or corrupt - * - * @throws IOException may be thrown by in - */ - public int read() throws IOException { - byte[] buf = new byte[1]; - return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); - } - - /** - * Decompresses into an array of bytes. - *

- * If len is zero, no bytes are read and 0 - * is returned. Otherwise this will try to decompress len - * bytes of uncompressed data. Less than len bytes may - * be read only in the following situations: - *

    - *
  • The end of the compressed data was reached successfully.
  • - *
  • An error is detected after at least one but less len - * bytes have already been successfully decompressed. - * The next call with non-zero len will immediately - * throw the pending exception.
  • - *
  • An exception is thrown.
  • - *
- * - * @param buf target buffer for uncompressed data - * @param off start offset in buf - * @param len maximum number of uncompressed bytes to read - * - * @return number of bytes read, or -1 to indicate - * the end of the compressed stream - * - * @throws CorruptedInputException - * @throws UnsupportedOptionsException - * @throws MemoryLimitException - * - * @throws EOFException - * compressed input is truncated or corrupt - * - * @throws IOException may be thrown by in - */ - public int read(byte[] buf, int off, int len) throws IOException { - if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) - throw new IllegalArgumentException(); - - if (len == 0) - return 0; - - if (exception != null) - throw exception; - - if (endReached) - return -1; - - int size = 0; - - try { - while (len > 0) { - if (blockDecoder == null) { - try { - blockDecoder = new BlockInputStream(in, check, - memoryLimit); - } catch (IndexIndicatorException e) { - indexHash.validate(in); - validateStreamFooter(); - endReached = true; - return size > 0 ? size : -1; - } - } - - int ret = blockDecoder.read(buf, off, len); - - if (ret > 0) { - size += ret; - off += ret; - len -= ret; - } else if (ret == -1) { - indexHash.add(blockDecoder.getUnpaddedSize(), - blockDecoder.getUncompressedSize()); - blockDecoder = null; - } - } - } catch (IOException e) { - exception = e; - if (size == 0) - throw e; - } - - return size; - } - - private void validateStreamFooter() throws IOException { - byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE]; - new DataInputStream(in).readFully(buf); - StreamFlags streamFooterFlags = DecoderUtil.decodeStreamFooter(buf); - - if (!DecoderUtil.areStreamFlagsEqual(streamHeaderFlags, - streamFooterFlags) - || indexHash.getIndexSize() != streamFooterFlags.backwardSize) - throw new CorruptedInputException( - "XZ Stream Footer does not match Stream Header"); - } - - /** - * Returns the number of uncompressed bytes that can be read - * without blocking. 
The value is returned with an assumption - * that the compressed input data will be valid. If the compressed - * data is corrupt, CorruptedInputException may get - * thrown before the number of bytes claimed to be available have - * been read from this input stream. - * - * @return the number of uncompressed bytes that can be read - * without blocking - */ - public int available() throws IOException { - return blockDecoder == null ? 0 : blockDecoder.available(); - } - - /** - * Calls in.close(). - */ - public void close() throws IOException { - in.close(); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/UnsupportedOptionsException.java b/third-party/xz/src/main/java/org/tukaani/xz/UnsupportedOptionsException.java deleted file mode 100644 index 9aa16e8c..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/UnsupportedOptionsException.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * UnsupportedOptionsException - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -/** - * Thrown when compression options not supported by this implementation - * are detected. Some other implementation might support those options. - */ -public class UnsupportedOptionsException extends XZIOException { - private static final long serialVersionUID = 3L; - - /** - * Creates a new UnsupportedOptionsException with null - * as its error detail message. - */ - public UnsupportedOptionsException() {} - - /** - * Creates a new UnsupportedOptionsException with the given - * error detail message. 
- * - * @param s error detail message - */ - public UnsupportedOptionsException(String s) { - super(s); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/XZ.java b/third-party/xz/src/main/java/org/tukaani/xz/XZ.java deleted file mode 100644 index 4e0857ff..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/XZ.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * XZ - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -/** - * XZ constants. - */ -public class XZ { - /** - * XZ Header Magic Bytes begin a XZ file. - * This can be useful to detect XZ compressed data. - */ - public static final byte[] HEADER_MAGIC = { - (byte)0xFD, '7', 'z', 'X', 'Z', '\0' }; - - /** - * XZ Footer Magic Bytes are the last bytes of a XZ Stream. - */ - public static final byte[] FOOTER_MAGIC = { 'Y', 'Z' }; - - /** - * Integrity check ID indicating that no integrity check is calculated. - *

- * Omitting the integrity check is strongly discouraged except when - * the integrity of the data will be verified by other means anyway, - * and calculating the check twice would be useless. - */ - public static final int CHECK_NONE = 0; - - /** - * Integrity check ID for CRC32. - */ - public static final int CHECK_CRC32 = 1; - - /** - * Integrity check ID for CRC64. - */ - public static final int CHECK_CRC64 = 4; - - /** - * Integrity check ID for SHA-256. - */ - public static final int CHECK_SHA256 = 10; - - private XZ() {} -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/XZFormatException.java b/third-party/xz/src/main/java/org/tukaani/xz/XZFormatException.java deleted file mode 100644 index 6f63020b..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/XZFormatException.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * XZFormatException - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -/** - * Thrown when the input data is not in the XZ format. - */ -public class XZFormatException extends XZIOException { - private static final long serialVersionUID = 3L; - - /** - * Creates a new exception with the default error detail message. - */ - public XZFormatException() { - super("Input is not in the XZ format"); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/XZIOException.java b/third-party/xz/src/main/java/org/tukaani/xz/XZIOException.java deleted file mode 100644 index 1801c70c..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/XZIOException.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * XZIOException - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -/** - * Generic IOException specific to this package. - * All IOExceptions thrown by this package are extended from XZIOException. 
- * This way it is easier to distinguish exceptions thrown by the XZ code - * from other IOExceptions. - */ -public class XZIOException extends java.io.IOException { - private static final long serialVersionUID = 3L; - - public XZIOException() { - super(); - } - - public XZIOException(String s) { - super(s); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/XZInputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/XZInputStream.java deleted file mode 100644 index 3c44af40..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/XZInputStream.java +++ /dev/null @@ -1,257 +0,0 @@ -/* - * XZInputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.InputStream; -import java.io.DataInputStream; -import java.io.IOException; -import java.io.EOFException; -import org.tukaani.xz.common.DecoderUtil; - -/** - * Decompresses a .xz file in streamed mode (no seeking). - *

- * Use this to decompress regular standalone .xz files. This reads from - * its input stream until the end of the input or until an error occurs. - * This supports decompressing concatenated .xz files. - * - * @see SingleXZInputStream - */ -public class XZInputStream extends InputStream { - private final int memoryLimit; - private final InputStream in; - private SingleXZInputStream xzIn; - private boolean endReached = false; - private IOException exception = null; - - /** - * Creates a new input stream that decompresses XZ-compressed data - * from in. - *

- * This constructor reads and parses the XZ Stream Header (12 bytes) - * from in. The header of the first Block is not read - * until read is called. - * - * @param in input stream from which XZ-compressed - * data is read - * - * @throws XZFormatException - * input is not in the XZ format - * - * @throws CorruptedInputException - * XZ header CRC32 doesn't match - * - * @throws UnsupportedOptionsException - * XZ header is valid but specifies options - * not supported by this implementation - * - * @throws EOFException - * less than 12 bytes of input was available - * from in - * - * @throws IOException may be thrown by in - */ - public XZInputStream(InputStream in) throws IOException { - this.in = in; - this.memoryLimit = -1; - this.xzIn = new SingleXZInputStream(in, -1); - } - - /** - * Creates a new input stream that decompresses XZ-compressed data - * from in. - *

- * This is identical to XZInputStream(InputStream) except - * that this takes also the memoryLimit argument. - * - * @param in input stream from which XZ-compressed - * data is read - * - * @param memoryLimit memory usage limit as kibibytes (KiB) - * or -1 to impose no memory usage limit - * - * @throws XZFormatException - * input is not in the XZ format - * - * @throws CorruptedInputException - * XZ header CRC32 doesn't match - * - * @throws UnsupportedOptionsException - * XZ header is valid but specifies options - * not supported by this implementation - * - * @throws EOFException - * less than 12 bytes of input was available - * from in - * - * @throws IOException may be thrown by in - */ - public XZInputStream(InputStream in, int memoryLimit) throws IOException { - this.in = in; - this.memoryLimit = memoryLimit; - this.xzIn = new SingleXZInputStream(in, memoryLimit); - } - - /** - * Decompresses the next byte from this input stream. - *

- * Reading lots of data with read() from this input stream - * may be inefficient. Wrap it in java.io.BufferedInputStream - * if you need to read lots of data one byte at a time. - * - * @return the next decompressed byte, or -1 - * to indicate the end of the compressed stream - * - * @throws CorruptedInputException - * @throws UnsupportedOptionsException - * @throws MemoryLimitException - * - * @throws EOFException - * compressed input is truncated or corrupt - * - * @throws IOException may be thrown by in - */ - public int read() throws IOException { - byte[] buf = new byte[1]; - return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); - } - - /** - * Decompresses into an array of bytes. - *

- * If len is zero, no bytes are read and 0 - * is returned. Otherwise this will try to decompress len - * bytes of uncompressed data. Less than len bytes may - * be read only in the following situations: - *

    - *
  • The end of the compressed data was reached successfully.
  • - *
  • An error is detected after at least one but less len - * bytes have already been successfully decompressed. - * The next call with non-zero len will immediately - * throw the pending exception.
  • - *
  • An exception is thrown.
  • - *
- * - * @param buf target buffer for uncompressed data - * @param off start offset in buf - * @param len maximum number of uncompressed bytes to read - * - * @return number of bytes read, or -1 to indicate - * the end of the compressed stream - * - * @throws CorruptedInputException - * @throws UnsupportedOptionsException - * @throws MemoryLimitException - * - * @throws EOFException - * compressed input is truncated or corrupt - * - * @throws IOException may be thrown by in - */ - public int read(byte[] buf, int off, int len) throws IOException { - if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) - throw new IllegalArgumentException(); - - if (len == 0) - return 0; - - if (exception != null) - throw exception; - - if (endReached) - return -1; - - int size = 0; - - try { - while (len > 0) { - if (xzIn == null) { - prepareNextStream(); - if (endReached) - return size == 0 ? -1 : size; - } - - int ret = xzIn.read(buf, off, len); - - if (ret > 0) { - size += ret; - off += ret; - len -= ret; - } else if (ret == -1) { - xzIn = null; - } - } - } catch (IOException e) { - exception = e; - if (size == 0) - throw e; - } - - return size; - } - - private void prepareNextStream() throws IOException { - DataInputStream inData = new DataInputStream(in); - byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE]; - - // The size of Stream Padding must be a multiple of four bytes, - // all bytes zero. - do { - // First try to read one byte to see if we have reached the end - // of the file. - int ret = inData.read(buf, 0, 1); - if (ret == -1) { - endReached = true; - return; - } - - // Since we got one byte of input, there must be at least - // three more available in a valid file. - inData.readFully(buf, 1, 3); - - } while (buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0); - - // Not all bytes are zero. In a valid Stream it indicates the - // beginning of the next Stream. Read the rest of the Stream Header - // and initialize the XZ decoder. 
- inData.readFully(buf, 4, DecoderUtil.STREAM_HEADER_SIZE - 4); - - try { - xzIn = new SingleXZInputStream(in, memoryLimit, buf); - } catch (XZFormatException e) { - // Since this isn't the first .xz Stream, it is more - // logical to tell that the data is corrupt. - throw new CorruptedInputException( - "Garbage after a valid XZ Stream"); - } - } - - /** - * Returns the number of uncompressed bytes that can be read - * without blocking. The value is returned with an assumption - * that the compressed input data will be valid. If the compressed - * data is corrupt, CorruptedInputException may get - * thrown before the number of bytes claimed to be available have - * been read from this input stream. - * - * @return the number of uncompressed bytes that can be read - * without blocking - */ - public int available() throws IOException { - return xzIn == null ? 0 : xzIn.available(); - } - - /** - * Calls in.close(). - */ - public void close() throws IOException { - in.close(); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/XZOutputStream.java b/third-party/xz/src/main/java/org/tukaani/xz/XZOutputStream.java deleted file mode 100644 index 053473fd..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/XZOutputStream.java +++ /dev/null @@ -1,290 +0,0 @@ -/* - * XZOutputStream - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz; - -import java.io.OutputStream; -import java.io.IOException; - -import org.tukaani.xz.common.EncoderUtil; -import org.tukaani.xz.common.StreamFlags; -import org.tukaani.xz.check.Check; -import org.tukaani.xz.index.IndexEncoder; - -/** - * Compresses into the .xz file format. 
- */ -public class XZOutputStream extends FinishableOutputStream { - private OutputStream out; - private final StreamFlags streamFlags = new StreamFlags(); - private Check check; - private final IndexEncoder index = new IndexEncoder(); - private FilterEncoder[] filters; - private BlockOutputStream blockEncoder = null; - private IOException exception = null; - private boolean finished = false; - - /** - * Creates a new output stream that compressed data into the .xz format. - * This is takes options for one filter as an argument. This constructor - * is equivalent to passing a single-member filterOptions array to the - * other constructor. - * - * @param out output stream to which the compressed data - * will be written - * - * @param filterOptions - * filter options to use - * - * @param checkType type of the integrity check, - * for example XZ.CHECK_CRC64 - * - * @throws UnsupportedOptionsException - * invalid filter chain - * - * @throws IOException may be thrown from out - */ - public XZOutputStream(OutputStream out, FilterOptions filterOptions, - int checkType) throws IOException { - FilterOptions[] ops = new FilterOptions[1]; - ops[0] = filterOptions; - initialize(out, ops, checkType); - } - - /** - * Creates a new output stream that compressed data into the .xz format. - * This takes an array of filter options, allowing the caller to specify - * a filter chain with 1-4 filters. 
- * - * @param out output stream to which the compressed data - * will be written - * - * @param filterOptions - * array of filter options to use - * - * @param checkType type of the integrity check, - * for example XZ.CHECK_CRC64 - * - * @throws UnsupportedOptionsException - * invalid filter chain - * - * @throws IOException may be thrown from out - */ - public XZOutputStream(OutputStream out, FilterOptions[] filterOptions, - int checkType) throws IOException { - initialize(out, filterOptions, checkType); - } - - private void initialize(OutputStream out, FilterOptions[] filterOptions, - int checkType) throws IOException { - this.out = out; - updateFilters(filterOptions); - - streamFlags.checkType = checkType; - check = Check.getInstance(checkType); - - encodeStreamHeader(); - } - - /** - * Updates the filter chain. - *

- * Currently this cannot be used to update e.g. LZMA2 options in the - * middle of a XZ Block. Use flush() to finish the current - * XZ Block before calling this function. The new filter chain will then - * be used for the next XZ Block. - */ - public void updateFilters(FilterOptions[] filterOptions) - throws XZIOException { - if (blockEncoder != null) - throw new UnsupportedOptionsException("Changing filter options " - + "in the middle of a XZ Block not implemented"); - - if (filterOptions.length < 1 || filterOptions.length > 4) - throw new UnsupportedOptionsException( - "XZ filter chain must be 1-4 filters"); - - FilterEncoder[] newFilters = new FilterEncoder[filterOptions.length]; - for (int i = 0; i < filterOptions.length; ++i) - newFilters[i] = filterOptions[i].getFilterEncoder(); - - RawCoder.validate(newFilters); - filters = newFilters; - } - - /** - * Writes one byte to be compressed. - * - * @throws XZIOException - * XZ stream has grown too big - * @throws IOException may be thrown by the underlying output stream - */ - public void write(int b) throws IOException { - byte[] buf = new byte[] { (byte)b }; - write(buf, 0, 1); - } - - /** - * Writes an array of bytes to be compressed. - * The compressors tend to do internal buffering and thus the written - * data won't be readable from the compressed output immediately. - * Use flush() to force everything written so far to - * be written to the underlaying output stream, but be aware that - * flushing reduces compression ratio. 
- * - * @param buf buffer of bytes to be written - * @param off start offset in buf - * @param len number of bytes to write - * - * @throws XZIOException - * XZ stream has grown too big - * @throws XZIOException - * finish() or close() - * was already called - * @throws IOException may be thrown by the underlying output stream - */ - public void write(byte[] buf, int off, int len) throws IOException { - if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) - throw new IllegalArgumentException(); - - if (len == 0) - return; - - if (finished) - exception = new XZIOException( - "XZOutputStream.write was called on a finished stream"); - - if (exception != null) - throw exception; - - if (blockEncoder == null) - blockEncoder = new BlockOutputStream(out, filters, check); - - try { - blockEncoder.write(buf, off, len); - } catch (IOException e) { - exception = e; - throw e; - } - } - - /** - * Flushes the encoder and calls out.flush(). - *

- * FIXME: I haven't decided yet how this will work in the final version. - * In the current implementation, flushing finishes the current .xz Block. - * This is equivalent to LZMA_FULL_FLUSH in liblzma (XZ Utils). - * Equivalent of liblzma's LZMA_SYNC_FLUSH might be implemented in - * the future, and perhaps should be what flush() should do. - */ - public void flush() throws IOException { - if (exception != null) - throw exception; - - if (blockEncoder != null) { - try { - blockEncoder.finish(); - index.add(blockEncoder.getUnpaddedSize(), - blockEncoder.getUncompressedSize()); - blockEncoder = null; - } catch (IOException e) { - exception = e; - throw e; - } - } - - out.flush(); - } - - /** - * Finishes compression without closing the underlying stream. - * No more data can be written to this stream after finishing - * (calling write with an empty buffer is OK). - *

- * Repeated calls to finish() do nothing unless - * an exception was thrown by this stream earlier. In that case - * the same exception is thrown again. - *

- * After finishing, the stream may be closed normally with - * close(). If the stream will be closed anyway, there - * usually is no need to call finish() separately. - */ - public void finish() throws IOException { - if (!finished) { - // flush() checks for pending exceptions so we don't need to - // worry about it here. - flush(); - - try { - index.encode(out); - encodeStreamFooter(); - finished = true; - } catch (IOException e) { - exception = e; - throw e; - } - } - } - - /** - * Finishes compression and closes the underlying stream. - * The underlying stream out is closed even if finishing - * fails. If both finishing and closing fail, the exception thrown - * by finish() is thrown and the exception from the failed - * out.close() is lost. - */ - public void close() throws IOException { - // If finish() throws an exception, it stores the exception to - // the variable "exception". So we can ignore the possible - // exception here. - try { - finish(); - } catch (IOException e) {} - - try { - out.close(); - } catch (IOException e) { - // Remember the exception but only if there is no previous - // pending exception. 
- if (exception == null) - exception = e; - } - - if (exception != null) - throw exception; - } - - private void encodeStreamFlags(byte[] buf, int off) { - buf[off] = 0x00; - buf[off + 1] = (byte)streamFlags.checkType; - } - - private void encodeStreamHeader() throws IOException { - out.write(XZ.HEADER_MAGIC); - - byte[] buf = new byte[2]; - encodeStreamFlags(buf, 0); - out.write(buf); - - EncoderUtil.writeCRC32(out, buf); - } - - private void encodeStreamFooter() throws IOException { - byte[] buf = new byte[6]; - long backwardSize = index.getIndexSize() / 4 - 1; - for (int i = 0; i < 4; ++i) - buf[i] = (byte)(backwardSize >>> (i * 8)); - - encodeStreamFlags(buf, 4); - - EncoderUtil.writeCRC32(out, buf); - out.write(buf); - out.write(XZ.FOOTER_MAGIC); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/check/CRC32.java b/third-party/xz/src/main/java/org/tukaani/xz/check/CRC32.java deleted file mode 100644 index 8025ba06..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/check/CRC32.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * CRC32 - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.check; - -public class CRC32 extends Check { - private final java.util.zip.CRC32 state = new java.util.zip.CRC32(); - - public CRC32() { - size = 4; - name = "CRC32"; - } - - public void update(byte[] buf, int off, int len) { - state.update(buf, off, len); - } - - public byte[] finish() { - long value = state.getValue(); - byte[] buf = new byte[] { (byte)(value), - (byte)(value >>> 8), - (byte)(value >>> 16), - (byte)(value >>> 24) }; - state.reset(); - return buf; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/check/CRC64.java b/third-party/xz/src/main/java/org/tukaani/xz/check/CRC64.java deleted file mode 100644 index 02b15b74..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/check/CRC64.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * CRC64 - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.check; - -public class CRC64 extends Check { - private static final long poly = 0xC96C5795D7870F42L; - private static final long[] crcTable = new long[256]; - - private long crc = -1; - - static { - for (int b = 0; b < crcTable.length; ++b) { - long r = b; - for (int i = 0; i < 8; ++i) { - if ((r & 1) == 1) - r = (r >>> 1) ^ poly; - else - r >>>= 1; - } - - crcTable[b] = r; - } - } - - public CRC64() { - size = 8; - name = "CRC64"; - } - - public void update(byte[] buf, int off, int len) { - int end = off + len; - - while (off < end) - crc = crcTable[(buf[off++] ^ (int)crc) & 0xFF] ^ (crc >>> 8); - } - - public byte[] finish() { - long value = ~crc; - crc = -1; - - byte[] buf = new byte[8]; - for (int i = 0; i < buf.length; ++i) - buf[i] = (byte)(value >> (i * 8)); - - return buf; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/check/Check.java b/third-party/xz/src/main/java/org/tukaani/xz/check/Check.java deleted file mode 100644 index f2fe4bae..00000000 --- 
a/third-party/xz/src/main/java/org/tukaani/xz/check/Check.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Check - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.check; - -import org.tukaani.xz.XZ; -import org.tukaani.xz.UnsupportedOptionsException; - -public abstract class Check { - int size; - String name; - - public abstract void update(byte[] buf, int off, int len); - public abstract byte[] finish(); - - public void update(byte[] buf) { - update(buf, 0, buf.length); - } - - public int getSize() { - return size; - } - - public String getName() { - return name; - } - - public static Check getInstance(int checkType) - throws UnsupportedOptionsException { - switch (checkType) { - case XZ.CHECK_NONE: - return new None(); - - case XZ.CHECK_CRC32: - return new CRC32(); - - case XZ.CHECK_CRC64: - return new CRC64(); - - case XZ.CHECK_SHA256: - try { - return new SHA256(); - } catch (java.security.NoSuchAlgorithmException e) {} - - break; - } - - throw new UnsupportedOptionsException( - "Unsupported Check ID " + checkType); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/check/None.java b/third-party/xz/src/main/java/org/tukaani/xz/check/None.java deleted file mode 100644 index b07c8e66..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/check/None.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * None - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.check; - -public class None extends Check { - public None() { - size = 0; - name = "None"; - } - - public void update(byte[] buf, int off, int len) {} - - public byte[] finish() { - byte[] empty = new byte[0]; - return empty; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/check/SHA256.java b/third-party/xz/src/main/java/org/tukaani/xz/check/SHA256.java deleted file mode 100644 index 66503c79..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/check/SHA256.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * SHA256 - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.check; - -public class SHA256 extends Check { - private final java.security.MessageDigest sha256; - - public SHA256() throws java.security.NoSuchAlgorithmException { - size = 32; - name = "SHA-256"; - sha256 = java.security.MessageDigest.getInstance("SHA-256"); - } - - public void update(byte[] buf, int off, int len) { - sha256.update(buf, off, len); - } - - public byte[] finish() { - byte[] buf = sha256.digest(); - sha256.reset(); - return buf; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/common/DecoderUtil.java b/third-party/xz/src/main/java/org/tukaani/xz/common/DecoderUtil.java deleted file mode 100644 index 77ba4413..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/common/DecoderUtil.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * DecoderUtil - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.common; - -import java.io.InputStream; -import java.io.IOException; -import java.io.EOFException; -import java.util.zip.CRC32; -import org.tukaani.xz.XZ; -import org.tukaani.xz.XZFormatException; -import org.tukaani.xz.CorruptedInputException; -import org.tukaani.xz.UnsupportedOptionsException; - -public class DecoderUtil extends Util { - public static boolean isCRC32Valid(byte[] buf, int off, int len, - int ref_off) { - CRC32 crc32 = new CRC32(); - crc32.update(buf, off, len); - long value = crc32.getValue(); - - for (int i = 0; i < 4; ++i) - if ((byte)(value >>> (i * 8)) != buf[ref_off + i]) - return false; - - return true; - } - - public static StreamFlags decodeStreamHeader(byte[] buf) - throws IOException { - for (int i = 0; i < XZ.HEADER_MAGIC.length; ++i) - if (buf[i] != XZ.HEADER_MAGIC[i]) - throw new XZFormatException(); - - if (!isCRC32Valid(buf, XZ.HEADER_MAGIC.length, 2, - XZ.HEADER_MAGIC.length + 2)) - throw new CorruptedInputException("XZ Stream Header is corrupt"); - - try { - return decodeStreamFlags(buf, XZ.HEADER_MAGIC.length); - } catch (UnsupportedOptionsException e) { - throw new UnsupportedOptionsException( - "Unsupported options in XZ Stream Header"); - } - } - - public static StreamFlags decodeStreamFooter(byte[] buf) - throws IOException { - if (buf[10] != XZ.FOOTER_MAGIC[0] || buf[11] != XZ.FOOTER_MAGIC[1]) { - // NOTE: The exception could be XZFormatException too. - // It depends on the situation which one is better. 
- throw new CorruptedInputException("XZ Stream Footer is corrupt"); - } - - if (!isCRC32Valid(buf, 4, 6, 0)) - throw new CorruptedInputException("XZ Stream Footer is corrupt"); - - StreamFlags streamFlags; - try { - streamFlags = decodeStreamFlags(buf, 8); - } catch (UnsupportedOptionsException e) { - throw new UnsupportedOptionsException( - "Unsupported options in XZ Stream Footer"); - } - - streamFlags.backwardSize = 0; - for (int i = 0; i < 4; ++i) - streamFlags.backwardSize |= (buf[i + 4] & 0xFF) << (i * 8); - - streamFlags.backwardSize = (streamFlags.backwardSize + 1) * 4; - - return streamFlags; - } - - private static StreamFlags decodeStreamFlags(byte[] buf, int off) - throws UnsupportedOptionsException { - if (buf[off] != 0x00 || (buf[off + 1] & 0xFF) >= 0x10) - throw new UnsupportedOptionsException(); - - StreamFlags streamFlags = new StreamFlags(); - streamFlags.checkType = buf[off + 1]; - - return streamFlags; - } - - public static boolean areStreamFlagsEqual(StreamFlags a, StreamFlags b) { - // backwardSize is intentionally not compared. - return a.checkType == b.checkType; - } - - public static long decodeVLI(InputStream in) throws IOException { - int b = in.read(); - if (b == -1) - throw new EOFException(); - - long num = b & 0x7F; - int i = 0; - - while ((b & 0x80) != 0x00) { - if (++i >= VLI_SIZE_MAX) - throw new CorruptedInputException(); - - b = in.read(); - if (b == -1) - throw new EOFException(); - - if (b == 0x00) - throw new CorruptedInputException(); - - num |= (long)(b & 0x7F) << (i * 7); - } - - return num; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/common/EncoderUtil.java b/third-party/xz/src/main/java/org/tukaani/xz/common/EncoderUtil.java deleted file mode 100644 index 57f688b5..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/common/EncoderUtil.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * EncoderUtil - * - * Author: Lasse Collin - * - * This file has been put into the public domain. 
- * You can do whatever you want with this file. - */ - -package org.tukaani.xz.common; - -import java.io.OutputStream; -import java.io.IOException; -import java.util.zip.CRC32; - -public class EncoderUtil extends Util { - public static void writeCRC32(OutputStream out, byte[] buf) - throws IOException { - CRC32 crc32 = new CRC32(); - crc32.update(buf); - long value = crc32.getValue(); - - for (int i = 0; i < 4; ++i) - out.write((byte)(value >>> (i * 8))); - } - - public static void encodeVLI(OutputStream out, long num) - throws IOException { - while (num >= 0x80) { - out.write((byte)(num | 0x80)); - num >>>= 7; - } - - out.write((byte)num); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/common/StreamFlags.java b/third-party/xz/src/main/java/org/tukaani/xz/common/StreamFlags.java deleted file mode 100644 index b306987d..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/common/StreamFlags.java +++ /dev/null @@ -1,15 +0,0 @@ -/* - * StreamFlags - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.common; - -public class StreamFlags { - public int checkType = -1; - public long backwardSize = -1; -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/common/Util.java b/third-party/xz/src/main/java/org/tukaani/xz/common/Util.java deleted file mode 100644 index c4324ce0..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/common/Util.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Util - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.common; - -public class Util { - public static final int STREAM_HEADER_SIZE = 12; - public static final long BACKWARD_SIZE_MAX = 1L << 34; - public static final int BLOCK_HEADER_SIZE_MAX = 1024; - public static final long VLI_MAX = Long.MAX_VALUE; - public static final int VLI_SIZE_MAX = 9; - - public static int getVLISize(long num) { - int size = 0; - do { - ++size; - num >>= 7; - } while (num != 0); - - return size; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/delta/DeltaCoder.java b/third-party/xz/src/main/java/org/tukaani/xz/delta/DeltaCoder.java deleted file mode 100644 index e3b300b0..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/delta/DeltaCoder.java +++ /dev/null @@ -1,27 +0,0 @@ -/* - * DeltaCoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.delta; - -abstract class DeltaCoder { - static final int DISTANCE_MIN = 1; - static final int DISTANCE_MAX = 256; - static final int DISTANCE_MASK = DISTANCE_MAX - 1; - - final int distance; - final byte[] history = new byte[DISTANCE_MAX]; - int pos = 0; - - public DeltaCoder(int distance) { - if (distance < DISTANCE_MIN || distance > DISTANCE_MAX) - throw new IllegalArgumentException(); - - this.distance = distance; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/delta/DeltaDecoder.java b/third-party/xz/src/main/java/org/tukaani/xz/delta/DeltaDecoder.java deleted file mode 100644 index 154cbf34..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/delta/DeltaDecoder.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * DeltaDecoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.delta; - -public class DeltaDecoder extends DeltaCoder { - public DeltaDecoder(int distance) { - super(distance); - } - - public void decode(byte[] buf, int off, int len) { - int end = off + len; - for (int i = off; i < end; ++i) { - buf[i] += history[(distance + pos) & DISTANCE_MASK]; - history[pos-- & DISTANCE_MASK] = buf[i]; - } - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexBase.java b/third-party/xz/src/main/java/org/tukaani/xz/index/IndexBase.java deleted file mode 100644 index e08f17ce..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexBase.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * IndexBase - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.index; - -import org.tukaani.xz.common.Util; -import org.tukaani.xz.XZIOException; - -abstract class IndexBase { - private final XZIOException invalidIndexException; - long blocksSum = 0; - long uncompressedSum = 0; - long indexListSize = 0; - long recordCount = 0; - - IndexBase(XZIOException invalidIndexException) { - this.invalidIndexException = invalidIndexException; - } - - private long getUnpaddedIndexSize() { - // Index Indicator + Number of Records + List of Records + CRC32 - return 1 + Util.getVLISize(recordCount) + indexListSize + 4; - } - - public long getIndexSize() { - return (getUnpaddedIndexSize() + 3) & ~3; - } - - long getStreamSize() { - return Util.STREAM_HEADER_SIZE + blocksSum + getIndexSize() - + Util.STREAM_HEADER_SIZE; - } - - int getIndexPaddingSize() { - return (int)((4 - getUnpaddedIndexSize()) & 3); - } - - void add(long unpaddedSize, long uncompressedSize) throws XZIOException { - blocksSum += (unpaddedSize + 3) & ~3; - uncompressedSum += uncompressedSize; - indexListSize += Util.getVLISize(unpaddedSize) - + Util.getVLISize(uncompressedSize); - ++recordCount; - - if (blocksSum < 0 || uncompressedSum < 0 
- || getIndexSize() > Util.BACKWARD_SIZE_MAX - || getStreamSize() < 0) - throw invalidIndexException; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexEncoder.java b/third-party/xz/src/main/java/org/tukaani/xz/index/IndexEncoder.java deleted file mode 100644 index f00f5c4b..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexEncoder.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * IndexEncoder - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.index; - -import java.io.OutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.zip.CheckedOutputStream; -import org.tukaani.xz.common.EncoderUtil; -import org.tukaani.xz.XZIOException; - -public class IndexEncoder extends IndexBase { - private final ArrayList records = new ArrayList<>(); - - public IndexEncoder() { - super(new XZIOException("XZ Stream or its Index has grown too big")); - } - - public void add(long unpaddedSize, long uncompressedSize) - throws XZIOException { - super.add(unpaddedSize, uncompressedSize); - records.add(new IndexRecord(unpaddedSize, uncompressedSize)); - } - - public void encode(OutputStream out) throws IOException { - java.util.zip.CRC32 crc32 = new java.util.zip.CRC32(); - CheckedOutputStream outChecked = new CheckedOutputStream(out, crc32); - - // Index Indicator - outChecked.write(0x00); - - // Number of Records - EncoderUtil.encodeVLI(outChecked, recordCount); - - // List of Records - for (Iterator i = records.iterator(); i.hasNext(); ) { - IndexRecord record = (IndexRecord)i.next(); - EncoderUtil.encodeVLI(outChecked, record.unpadded); - EncoderUtil.encodeVLI(outChecked, record.uncompressed); - } - - // Index Padding - for (int i = getIndexPaddingSize(); i > 0; --i) - outChecked.write(0x00); - - // CRC32 - long value = crc32.getValue(); - for (int i = 0; i < 4; ++i) - 
out.write((byte)(value >>> (i * 8))); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexHash.java b/third-party/xz/src/main/java/org/tukaani/xz/index/IndexHash.java deleted file mode 100644 index ab168c69..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexHash.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * IndexHash - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.index; - -import java.io.InputStream; -import java.io.DataInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.Arrays; -import java.util.zip.CheckedInputStream; -import org.tukaani.xz.common.DecoderUtil; -import org.tukaani.xz.XZIOException; -import org.tukaani.xz.CorruptedInputException; - -public class IndexHash extends IndexBase { - private org.tukaani.xz.check.Check hash; - - public IndexHash() { - super(new CorruptedInputException()); - - try { - hash = new org.tukaani.xz.check.SHA256(); - } catch (java.security.NoSuchAlgorithmException e) { - hash = new org.tukaani.xz.check.CRC32(); - } - } - - public void add(long unpaddedSize, long uncompressedSize) - throws XZIOException { - super.add(unpaddedSize, uncompressedSize); - - ByteBuffer buf = ByteBuffer.allocate(2 * 8); - buf.putLong(unpaddedSize); - buf.putLong(uncompressedSize); - hash.update(buf.array()); - } - - public void validate(InputStream in) throws IOException { - // Index Indicator (0x00) has already been read by BlockInputStream - // so add 0x00 to the CRC32 here. - java.util.zip.CRC32 crc32 = new java.util.zip.CRC32(); - crc32.update('\0'); - CheckedInputStream inChecked = new CheckedInputStream(in, crc32); - - // Get and validate the Number of Records field. 
- long storedRecordCount = DecoderUtil.decodeVLI(inChecked); - if (storedRecordCount != recordCount) - throw new CorruptedInputException("XZ Index is corrupt"); - - // Decode and hash the Index field and compare it to - // the hash value calculated from the decoded Blocks. - IndexHash stored = new IndexHash(); - for (long i = 0; i < recordCount; ++i) { - long unpaddedSize = DecoderUtil.decodeVLI(inChecked); - long uncompressedSize = DecoderUtil.decodeVLI(inChecked); - - try { - stored.add(unpaddedSize, uncompressedSize); - } catch (XZIOException e) { - throw new CorruptedInputException("XZ Index is corrupt"); - } - - if (stored.blocksSum > blocksSum - || stored.uncompressedSum > uncompressedSum - || stored.indexListSize > indexListSize) - throw new CorruptedInputException("XZ Index is corrupt"); - } - - if (stored.blocksSum != blocksSum - || stored.uncompressedSum != uncompressedSum - || stored.indexListSize != indexListSize - || !Arrays.equals(stored.hash.finish(), hash.finish())) - throw new CorruptedInputException("XZ Index is corrupt"); - - // Index Padding - DataInputStream inData = new DataInputStream(inChecked); - for (int i = getIndexPaddingSize(); i > 0; --i) - if (inData.readUnsignedByte() != 0x00) - throw new CorruptedInputException("XZ Index is corrupt"); - - // CRC32 - long value = crc32.getValue(); - for (int i = 0; i < 4; ++i) - if (((value >>> (i * 8)) & 0xFF) != inData.readUnsignedByte()) - throw new CorruptedInputException("XZ Index is corrupt"); - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexRecord.java b/third-party/xz/src/main/java/org/tukaani/xz/index/IndexRecord.java deleted file mode 100644 index 97629cc3..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/index/IndexRecord.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * IndexRecord - * - * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.index; - -public class IndexRecord { - public final long unpadded; - public final long uncompressed; - - IndexRecord(long unpadded, long uncompressed) { - this.unpadded = unpadded; - this.uncompressed = uncompressed; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/lz/LZDecoder.java b/third-party/xz/src/main/java/org/tukaani/xz/lz/LZDecoder.java deleted file mode 100644 index 680fec10..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/lz/LZDecoder.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * LZDecoder - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.lz; - -import java.io.DataInputStream; -import java.io.IOException; -import org.tukaani.xz.CorruptedInputException; - -public final class LZDecoder { - private final byte[] buf; - private int start = 0; - private int pos = 0; - private int full = 0; - private int limit = 0; - private int pendingLen = 0; - private int pendingDist = 0; - - public LZDecoder(int dictSize, byte[] presetDict) { - buf = new byte[dictSize]; - - if (presetDict != null) { - pos = Math.min(presetDict.length, dictSize); - full = pos; - start = pos; - System.arraycopy(presetDict, presetDict.length - pos, buf, 0, pos); - } - } - - public void reset() { - start = 0; - pos = 0; - full = 0; - limit = 0; - buf[buf.length - 1] = 0x00; - } - - public void setLimit(int outMax) { - if (buf.length - pos <= outMax) - limit = buf.length; - else - limit = pos + outMax; - } - - public boolean hasSpace() { - return pos < limit; - } - - public boolean hasPending() { - return pendingLen > 0; - } - - public int getPos() { - return pos; - } - - public int getByte(int dist) { - int offset = pos - dist - 1; - if (dist >= pos) - offset += buf.length; - - return buf[offset] & 0xFF; - } - - public void putByte(byte b) { - buf[pos++] = b; - - if (full < pos) - full = pos; - } - - public 
void repeat(int dist, int len) throws IOException { - if (dist < 0 || dist >= full) - throw new CorruptedInputException(); - - int left = Math.min(limit - pos, len); - pendingLen = len - left; - pendingDist = dist; - - int back = pos - dist - 1; - if (dist >= pos) - back += buf.length; - - do { - buf[pos++] = buf[back++]; - if (back == buf.length) - back = 0; - } while (--left > 0); - - if (full < pos) - full = pos; - } - - public void repeatPending() throws IOException { - if (pendingLen > 0) - repeat(pendingDist, pendingLen); - } - - public void copyUncompressed(DataInputStream inData, int len) - throws IOException { - int copySize = Math.min(buf.length - pos, len); - inData.readFully(buf, pos, copySize); - pos += copySize; - - if (full < pos) - full = pos; - } - - public int flush(byte[] out, int outOff) { - int copySize = pos - start; - if (pos == buf.length) - pos = 0; - - System.arraycopy(buf, start, out, outOff, copySize); - start = pos; - - return copySize; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/lzma/LZMACoder.java b/third-party/xz/src/main/java/org/tukaani/xz/lzma/LZMACoder.java deleted file mode 100644 index ec6b861f..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/lzma/LZMACoder.java +++ /dev/null @@ -1,139 +0,0 @@ -/* - * LZMACoder - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.lzma; - -import org.tukaani.xz.rangecoder.RangeCoder; - -abstract class LZMACoder { - static final int POS_STATES_MAX = 1 << 4; - - static final int MATCH_LEN_MIN = 2; - static final int MATCH_LEN_MAX = MATCH_LEN_MIN + LengthCoder.LOW_SYMBOLS - + LengthCoder.MID_SYMBOLS - + LengthCoder.HIGH_SYMBOLS - 1; - - static final int DIST_STATES = 4; - static final int DIST_SLOTS = 1 << 6; - static final int DIST_MODEL_START = 4; - static final int DIST_MODEL_END = 14; - - static final int ALIGN_BITS = 4; - static final int ALIGN_SIZE = 1 << ALIGN_BITS; - static final int ALIGN_MASK = ALIGN_SIZE - 1; - - static final int REPS = 4; - - final int posMask; - - final int[] rep = new int[4]; - final State state = new State(); - - final short[][] isMatch = new short[State.STATES][POS_STATES_MAX]; - final short[] isRep = new short[State.STATES]; - final short[] isRep0 = new short[State.STATES]; - final short[] isRep1 = new short[State.STATES]; - final short[] isRep2 = new short[State.STATES]; - final short[][] isRep0Long = new short[State.STATES][POS_STATES_MAX]; - final short[][] distSlots = new short[DIST_STATES][DIST_SLOTS]; - final short[][] distSpecial = { new short[2], new short[2], - new short[4], new short[4], - new short[8], new short[8], - new short[16], new short[16], - new short[32], new short[32] }; - final short[] distAlign = new short[ALIGN_SIZE]; - - static int getDistState(int len) { - return len < DIST_STATES + MATCH_LEN_MIN - ? 
len - MATCH_LEN_MIN - : DIST_STATES - 1; - } - - LZMACoder(int pb) { - posMask = (1 << pb) - 1; - } - - void reset() { - rep[0] = 0; - rep[1] = 0; - rep[2] = 0; - rep[3] = 0; - state.reset(); - - for (int i = 0; i < isMatch.length; ++i) - RangeCoder.initProbs(isMatch[i]); - - RangeCoder.initProbs(isRep); - RangeCoder.initProbs(isRep0); - RangeCoder.initProbs(isRep1); - RangeCoder.initProbs(isRep2); - - for (int i = 0; i < isRep0Long.length; ++i) - RangeCoder.initProbs(isRep0Long[i]); - - for (int i = 0; i < distSlots.length; ++i) - RangeCoder.initProbs(distSlots[i]); - - for (int i = 0; i < distSpecial.length; ++i) - RangeCoder.initProbs(distSpecial[i]); - - RangeCoder.initProbs(distAlign); - } - - - abstract static class LiteralCoder { - private final int lc; - private final int literalPosMask; - - LiteralCoder(int lc, int lp) { - this.lc = lc; - this.literalPosMask = (1 << lp) - 1; - } - - final int getSubcoderIndex(int prevByte, int pos) { - int low = prevByte >> (8 - lc); - int high = (pos & literalPosMask) << lc; - return low + high; - } - - - abstract class LiteralSubcoder { - final short[] probs = new short[0x300]; - - void reset() { - RangeCoder.initProbs(probs); - } - } - } - - - abstract static class LengthCoder { - static final int LOW_SYMBOLS = 1 << 3; - static final int MID_SYMBOLS = 1 << 3; - static final int HIGH_SYMBOLS = 1 << 8; - - final short[] choice = new short[2]; - final short[][] low = new short[POS_STATES_MAX][LOW_SYMBOLS]; - final short[][] mid = new short[POS_STATES_MAX][MID_SYMBOLS]; - final short[] high = new short[HIGH_SYMBOLS]; - - void reset() { - RangeCoder.initProbs(choice); - - for (int i = 0; i < low.length; ++i) - RangeCoder.initProbs(low[i]); - - for (int i = 0; i < low.length; ++i) - RangeCoder.initProbs(mid[i]); - - RangeCoder.initProbs(high); - } - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/lzma/LZMADecoder.java b/third-party/xz/src/main/java/org/tukaani/xz/lzma/LZMADecoder.java deleted file mode 100644 
index 8c97ee79..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/lzma/LZMADecoder.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * LZMADecoder - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.lzma; - -import java.io.IOException; -import org.tukaani.xz.lz.LZDecoder; -import org.tukaani.xz.rangecoder.RangeDecoder; -import org.tukaani.xz.CorruptedInputException; - -public final class LZMADecoder extends LZMACoder { - private final LZDecoder lz; - private final RangeDecoder rc; - private final LiteralDecoder literalDecoder; - private final LengthDecoder matchLenDecoder = new LengthDecoder(); - private final LengthDecoder repLenDecoder = new LengthDecoder(); - - public LZMADecoder(LZDecoder lz, RangeDecoder rc, int lc, int lp, int pb) { - super(pb); - this.lz = lz; - this.rc = rc; - this.literalDecoder = new LiteralDecoder(lc, lp); - reset(); - } - - public void reset() { - super.reset(); - literalDecoder.reset(); - matchLenDecoder.reset(); - repLenDecoder.reset(); - } - - public void decode() throws IOException { - lz.repeatPending(); - - while (lz.hasSpace()) { - int posState = lz.getPos() & posMask; - - if (rc.decodeBit(isMatch[state.get()], posState) == 0) { - literalDecoder.decode(); - } else { - int len = rc.decodeBit(isRep, state.get()) == 0 - ? 
decodeMatch(posState) - : decodeRepMatch(posState); - lz.repeat(rep[0], len); - } - } - - rc.normalize(); - - if (!rc.isInBufferOK()) - throw new CorruptedInputException(); - } - - private int decodeMatch(int posState) throws IOException { - state.updateMatch(); - - rep[3] = rep[2]; - rep[2] = rep[1]; - rep[1] = rep[0]; - - int len = matchLenDecoder.decode(posState); - int distSlot = rc.decodeBitTree(distSlots[getDistState(len)]); - - if (distSlot < DIST_MODEL_START) { - rep[0] = distSlot; - } else { - int limit = (distSlot >> 1) - 1; - rep[0] = (2 | (distSlot & 1)) << limit; - - if (distSlot < DIST_MODEL_END) { - rep[0] |= rc.decodeReverseBitTree( - distSpecial[distSlot - DIST_MODEL_START]); - } else { - rep[0] |= rc.decodeDirectBits(limit - ALIGN_BITS) - << ALIGN_BITS; - rep[0] |= rc.decodeReverseBitTree(distAlign); - } - } - - return len; - } - - private int decodeRepMatch(int posState) throws IOException { - if (rc.decodeBit(isRep0, state.get()) == 0) { - if (rc.decodeBit(isRep0Long[state.get()], posState) == 0) { - state.updateShortRep(); - return 1; - } - } else { - int tmp; - - if (rc.decodeBit(isRep1, state.get()) == 0) { - tmp = rep[1]; - } else { - if (rc.decodeBit(isRep2, state.get()) == 0) { - tmp = rep[2]; - } else { - tmp = rep[3]; - rep[3] = rep[2]; - } - - rep[2] = rep[1]; - } - - rep[1] = rep[0]; - rep[0] = tmp; - } - - state.updateLongRep(); - - return repLenDecoder.decode(posState); - } - - - private class LiteralDecoder extends LiteralCoder { - final LiteralSubdecoder[] subdecoders; - - LiteralDecoder(int lc, int lp) { - super(lc, lp); - - subdecoders = new LiteralSubdecoder[1 << (lc + lp)]; - for (int i = 0; i < subdecoders.length; ++i) - subdecoders[i] = new LiteralSubdecoder(); - } - - void reset() { - for (int i = 0; i < subdecoders.length; ++i) - subdecoders[i].reset(); - } - - void decode() throws IOException { - int i = getSubcoderIndex(lz.getByte(0), lz.getPos()); - subdecoders[i].decode(); - } - - - private class LiteralSubdecoder 
extends LiteralSubcoder { - void decode() throws IOException { - int symbol = 1; - - if (state.isLiteral()) { - do { - symbol = (symbol << 1) | rc.decodeBit(probs, symbol); - } while (symbol < 0x100); - - } else { - int matchByte = lz.getByte(rep[0]); - int offset = 0x100; - int matchBit; - int bit; - - do { - matchByte <<= 1; - matchBit = matchByte & offset; - bit = rc.decodeBit(probs, offset + matchBit + symbol); - symbol = (symbol << 1) | bit; - offset &= (-bit) ^ ~matchBit; - } while (symbol < 0x100); - } - - lz.putByte((byte)symbol); - state.updateLiteral(); - } - } - } - - - private class LengthDecoder extends LengthCoder { - int decode(int posState) throws IOException { - if (rc.decodeBit(choice, 0) == 0) - return rc.decodeBitTree(low[posState]) + MATCH_LEN_MIN; - - if (rc.decodeBit(choice, 1) == 0) - return rc.decodeBitTree(mid[posState]) - + MATCH_LEN_MIN + LOW_SYMBOLS; - - return rc.decodeBitTree(high) - + MATCH_LEN_MIN + LOW_SYMBOLS + MID_SYMBOLS; - } - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/lzma/State.java b/third-party/xz/src/main/java/org/tukaani/xz/lzma/State.java deleted file mode 100644 index 43895ab0..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/lzma/State.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * State - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.lzma; - -final class State { - static final int STATES = 12; - - private static final int LIT_STATES = 7; - - private static final int LIT_LIT = 0; - private static final int MATCH_LIT_LIT = 1; - private static final int REP_LIT_LIT = 2; - private static final int SHORTREP_LIT_LIT = 3; - private static final int MATCH_LIT = 4; - private static final int REP_LIT = 5; - private static final int SHORTREP_LIT = 6; - private static final int LIT_MATCH = 7; - private static final int LIT_LONGREP = 8; - private static final int LIT_SHORTREP = 9; - private static final int NONLIT_MATCH = 10; - private static final int NONLIT_REP = 11; - - private int state; - - void reset() { - state = LIT_LIT; - } - - int get() { - return state; - } - - void updateLiteral() { - if (state <= SHORTREP_LIT_LIT) - state = LIT_LIT; - else if (state <= LIT_SHORTREP) - state -= 3; - else - state -= 6; - } - - void updateMatch() { - state = state < LIT_STATES ? LIT_MATCH : NONLIT_MATCH; - } - - void updateLongRep() { - state = state < LIT_STATES ? LIT_LONGREP : NONLIT_REP; - } - - void updateShortRep() { - state = state < LIT_STATES ? LIT_SHORTREP : NONLIT_REP; - } - - boolean isLiteral() { - return state < LIT_STATES; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/package-info.java b/third-party/xz/src/main/java/org/tukaani/xz/package-info.java deleted file mode 100644 index 8c6caea2..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/package-info.java +++ /dev/null @@ -1,21 +0,0 @@ -/** - * XZ data compression support. - *

- * In the (very) long term, this aims to be a complete implementation of - * XZ data compression in Java. Currently only streamed decompression is - * supported. - *

- * For the latest source code, see the - * home page of XZ in Java. - * - *

Decompression notes

- * - * If you are decompressing complete files and your application knows - * exactly how much uncompressed data there should be, it is still good - * to try reading one more byte by calling read() and checking - * that it returns -1. This way the decompressor will parse the - * file footers and verify the integrity checks, giving the caller more - * confidence that the uncompressed data is valid. (This advice seems to - * apply to java.util.zip.GZIPInputStream too.) - */ -package org.tukaani.xz; diff --git a/third-party/xz/src/main/java/org/tukaani/xz/rangecoder/RangeCoder.java b/third-party/xz/src/main/java/org/tukaani/xz/rangecoder/RangeCoder.java deleted file mode 100644 index 81b4be80..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/rangecoder/RangeCoder.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * RangeCoder - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. - */ - -package org.tukaani.xz.rangecoder; - -public abstract class RangeCoder { - static final int SHIFT_BITS = 8; - static final int TOP_MASK = 0xFF000000; - static final int BIT_MODEL_TOTAL_BITS = 11; - static final int BIT_MODEL_TOTAL = 1 << BIT_MODEL_TOTAL_BITS; - static final short PROB_INIT = (short)(BIT_MODEL_TOTAL / 2); - static final int MOVE_BITS = 5; - - public static void initProbs(short[] probs) { - for (int i = 0; i < probs.length; ++i) - probs[i] = PROB_INIT; - } -} diff --git a/third-party/xz/src/main/java/org/tukaani/xz/rangecoder/RangeDecoder.java b/third-party/xz/src/main/java/org/tukaani/xz/rangecoder/RangeDecoder.java deleted file mode 100644 index f9ea4e56..00000000 --- a/third-party/xz/src/main/java/org/tukaani/xz/rangecoder/RangeDecoder.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * RangeDecoder - * - * Authors: Lasse Collin - * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. 
- */ - -package org.tukaani.xz.rangecoder; - -import java.io.DataInputStream; -import java.io.IOException; -import org.tukaani.xz.CorruptedInputException; - -public final class RangeDecoder extends RangeCoder { - private static final int INIT_SIZE = 5; - - private final byte[] buf; - private int pos = 0; - private int end = 0; - - private int range = 0; - private int code = 0; - - public RangeDecoder(int inputSizeMax) { - buf = new byte[inputSizeMax - INIT_SIZE]; - } - - public void prepareInputBuffer(DataInputStream in, int len) - throws IOException { - if (len < INIT_SIZE) - throw new CorruptedInputException(); - - if (in.readUnsignedByte() != 0x00) - throw new CorruptedInputException(); - - code = in.readInt(); - range = 0xFFFFFFFF; - - pos = 0; - end = len - INIT_SIZE; - in.readFully(buf, 0, end); - } - - public boolean isInBufferOK() { - return pos <= end; - } - - public boolean isFinished() { - return pos == end && code == 0; - } - - public void normalize() throws IOException { - if ((range & TOP_MASK) == 0) { - try { - // If the input is corrupt, this might throw - // ArrayIndexOutOfBoundsException. - code = (code << SHIFT_BITS) | (buf[pos++] & 0xFF); - range <<= SHIFT_BITS; - } catch (ArrayIndexOutOfBoundsException e) { - throw new CorruptedInputException(); - } - } - } - - public int decodeBit(short[] probs, int index) throws IOException { - normalize(); - - int prob = probs[index]; - int bound = (range >>> BIT_MODEL_TOTAL_BITS) * prob; - int bit; - - // Compare code and bound as if they were unsigned 32-bit integers. 
- if ((code ^ 0x80000000) < (bound ^ 0x80000000)) { - range = bound; - probs[index] = (short)( - prob + ((BIT_MODEL_TOTAL - prob) >>> MOVE_BITS)); - bit = 0; - } else { - range -= bound; - code -= bound; - probs[index] = (short)(prob - (prob >>> MOVE_BITS)); - bit = 1; - } - - return bit; - } - - public int decodeBitTree(short[] probs) throws IOException { - int symbol = 1; - - do { - symbol = (symbol << 1) | decodeBit(probs, symbol); - } while (symbol < probs.length); - - return symbol - probs.length; - } - - public int decodeReverseBitTree(short[] probs) throws IOException { - int symbol = 1; - int i = 0; - int result = 0; - - do { - int bit = decodeBit(probs, symbol); - symbol = (symbol << 1) | bit; - result |= bit << i++; - } while (symbol < probs.length); - - return result; - } - - public int decodeDirectBits(int count) throws IOException { - int result = 0; - - do { - normalize(); - - range >>>= 1; - int t = (code - range) >>> 31; - code -= range & (t - 1); - result = (result << 1) | (1 - t); - } while (--count != 0); - - return result; - } -}