(control) Fully automatic conversion
Removed the need to have to run an external tool to pre-process the data in order to load stackexchange-style data into the search engine. Removed the tool itself. This stirred up some issues with the dependencies, that were due to both third-party:ing xz and importing it as a dependency. This has been fixed, and :third-party:xz was removed.
This commit is contained in:
parent
3a325845c7
commit
40c9d2050f
@ -13,7 +13,6 @@ java {
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation 'org.tukaani:xz:1.8'
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:common:model')
|
||||
implementation libs.notnull
|
||||
@ -26,6 +25,7 @@ dependencies {
|
||||
implementation libs.zstd
|
||||
implementation libs.trove
|
||||
implementation libs.commons.compress
|
||||
implementation libs.xz
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
|
@ -36,8 +36,8 @@ public class StackExchangePostsDb {
|
||||
public static void create(String domain,
|
||||
Path sqliteFile,
|
||||
Path stackExchange7zFile) {
|
||||
if (Files.exists(sqliteFile))
|
||||
Files.delete(sqliteFile);
|
||||
Files.deleteIfExists(sqliteFile);
|
||||
|
||||
String connStr = "jdbc:sqlite:" + sqliteFile;
|
||||
|
||||
try (var connection = DriverManager.getConnection(connStr);
|
||||
|
@ -95,8 +95,6 @@ dependencies {
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
implementation 'org.tukaani:xz:1.8'
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
testImplementation project(':code:processes:crawling-process')
|
||||
}
|
||||
|
@ -240,50 +240,57 @@ public class ConverterMain extends ProcessMainClass {
|
||||
var msgOpt = getMessage(inbox, nu.marginalia.mqapi.converting.ConvertRequest.class.getSimpleName());
|
||||
var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
|
||||
|
||||
var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class);
|
||||
try {
|
||||
var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class);
|
||||
|
||||
return switch(request.action) {
|
||||
case ConvertCrawlData -> {
|
||||
var crawlData = fileStorageService.getStorage(request.crawlStorage);
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
return switch (request.action) {
|
||||
case ConvertCrawlData -> {
|
||||
var crawlData = fileStorageService.getStorage(request.crawlStorage);
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
|
||||
var plan = new CrawlPlan(null,
|
||||
new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"),
|
||||
new CrawlPlan.WorkDir(processData.path(), "processor.log"));
|
||||
var plan = new CrawlPlan(null,
|
||||
new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"),
|
||||
new CrawlPlan.WorkDir(processData.path(), "processor.log"));
|
||||
|
||||
yield new ConvertCrawlDataAction(plan, msg, inbox);
|
||||
}
|
||||
case SideloadEncyclopedia -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
yield new ConvertCrawlDataAction(plan, msg, inbox);
|
||||
}
|
||||
case SideloadEncyclopedia -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
|
||||
yield new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(Path.of(request.inputSource), request.baseUrl),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
case SideloadDirtree -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
yield new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(Path.of(request.inputSource), request.baseUrl),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
case SideloadDirtree -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
|
||||
yield new SideloadAction(
|
||||
sideloadSourceFactory.sideloadDirtree(Path.of(request.inputSource)),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
case SideloadWarc -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
yield new SideloadAction(
|
||||
sideloadSourceFactory.sideloadDirtree(Path.of(request.inputSource)),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
case SideloadWarc -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
|
||||
yield new SideloadAction(
|
||||
sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
case SideloadStackexchange -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
yield new SideloadAction(
|
||||
sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
case SideloadStackexchange -> {
|
||||
var processData = fileStorageService.getStorage(request.processedDataStorage);
|
||||
|
||||
yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(Path.of(request.inputSource)),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
};
|
||||
yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(Path.of(request.inputSource)),
|
||||
processData.asPath(),
|
||||
msg, inbox);
|
||||
}
|
||||
};
|
||||
}
|
||||
catch (Exception ex) {
|
||||
inbox.sendResponse(msg, MqInboxResponse.err(STR."\{ex.getClass().getSimpleName()}: \{ex.getMessage()}"));
|
||||
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException {
|
||||
|
@ -16,6 +16,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
public class SideloadSourceFactory {
|
||||
private final Gson gson;
|
||||
@ -57,14 +58,21 @@ public class SideloadSourceFactory {
|
||||
return warcSideloadFactory.createSideloaders(pathToWarcFiles);
|
||||
}
|
||||
|
||||
/** Do not use, this code isn't finished */
|
||||
public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
|
||||
try (var dirs = Files.walk(pathToDbFileRoot)) {
|
||||
return dirs
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(f -> f.toFile().getName().endsWith(".db"))
|
||||
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
|
||||
.toList();
|
||||
if (Files.isRegularFile(pathToDbFileRoot)) {
|
||||
return List.of(new StackexchangeSideloader(pathToDbFileRoot, sentenceExtractorProvider, documentKeywordExtractor));
|
||||
}
|
||||
else if (Files.isDirectory(pathToDbFileRoot)) {
|
||||
try (var dirs = Files.walk(pathToDbFileRoot)) {
|
||||
return dirs
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(f -> f.toFile().getName().endsWith(".db"))
|
||||
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
|
||||
.toList();
|
||||
}
|
||||
}
|
||||
else { // unix socket, etc
|
||||
throw new IllegalArgumentException("Path to stackexchange db file(s) must be a file or directory");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -21,23 +21,33 @@ public class WarcSideloadFactory {
|
||||
}
|
||||
|
||||
public Collection<? extends SideloadSource> createSideloaders(Path pathToWarcFiles) throws IOException {
|
||||
final List<Path> files = new ArrayList<>();
|
||||
|
||||
try (var stream = Files.list(pathToWarcFiles)) {
|
||||
stream
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(this::isWarcFile)
|
||||
.forEach(files::add);
|
||||
|
||||
if (Files.isRegularFile(pathToWarcFiles)) {
|
||||
return List.of(new WarcSideloader(pathToWarcFiles, processing));
|
||||
}
|
||||
else if (Files.isDirectory(pathToWarcFiles)) {
|
||||
|
||||
List<WarcSideloader> sources = new ArrayList<>();
|
||||
final List<Path> files = new ArrayList<>();
|
||||
|
||||
for (Path file : files) {
|
||||
sources.add(new WarcSideloader(file, processing));
|
||||
try (var stream = Files.list(pathToWarcFiles)) {
|
||||
stream
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(this::isWarcFile)
|
||||
.forEach(files::add);
|
||||
|
||||
}
|
||||
|
||||
List<WarcSideloader> sources = new ArrayList<>();
|
||||
|
||||
for (Path file : files) {
|
||||
sources.add(new WarcSideloader(file, processing));
|
||||
}
|
||||
|
||||
return sources;
|
||||
}
|
||||
else {
|
||||
throw new IllegalArgumentException("Path " + pathToWarcFiles + " is neither a file nor a directory");
|
||||
}
|
||||
|
||||
return sources;
|
||||
}
|
||||
|
||||
private boolean isWarcFile(Path path) {
|
||||
|
@ -40,6 +40,7 @@ dependencies {
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
implementation project(':code:features-crawl:link-parser')
|
||||
implementation project(':code:features-convert:data-extractors')
|
||||
implementation project(':code:features-convert:stackexchange-xml')
|
||||
implementation project(':code:features-index:index-journal')
|
||||
implementation project(':code:api:index-api')
|
||||
implementation project(':code:api:query-api')
|
||||
|
@ -10,6 +10,8 @@ import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.encyclopedia.EncyclopediaConverter;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.sideload.SideloadHelper;
|
||||
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@ -21,11 +23,8 @@ import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.zip.CRC32;
|
||||
|
||||
@Singleton
|
||||
public class ConvertActor extends RecordActorPrototype {
|
||||
@ -109,7 +108,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
if (source.toLowerCase().endsWith(".zim")) {
|
||||
// If we're fed a ZIM file, we need to convert it to a sqlite database first
|
||||
String hash = getCrc32FileHash(sourcePath);
|
||||
String hash = SideloadHelper.getCrc32FileHash(sourcePath);
|
||||
|
||||
// To avoid re-converting the same file, we'll assign the file a name based on its hash
|
||||
// and the original filename. This way, if we're fed the same file again, we'll be able to just
|
||||
@ -179,6 +178,10 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW);
|
||||
|
||||
// Convert stackexchange data to sqlite database
|
||||
// (we can't use a Predigest- step here because the conversion is too complicated)
|
||||
StackExchangeSideloadHelper.convertStackexchangeData(sourcePath);
|
||||
|
||||
// Pre-send convert request
|
||||
|
||||
yield new ConvertWait(
|
||||
@ -200,21 +203,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
};
|
||||
}
|
||||
|
||||
private String getCrc32FileHash(Path file) throws IOException {
|
||||
ByteBuffer buffer = ByteBuffer.allocate(8192);
|
||||
|
||||
try (var channel = Files.newByteChannel(file)) {
|
||||
CRC32 crc = new CRC32();
|
||||
|
||||
while (channel.read(buffer) > 0) {
|
||||
buffer.flip();
|
||||
crc.update(buffer);
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
return Long.toHexString(crc.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
|
@ -0,0 +1,25 @@
|
||||
package nu.marginalia.sideload;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.zip.CRC32;
|
||||
|
||||
public class SideloadHelper {
|
||||
public static String getCrc32FileHash(Path file) throws IOException {
|
||||
ByteBuffer buffer = ByteBuffer.allocate(8192);
|
||||
|
||||
try (var channel = Files.newByteChannel(file)) {
|
||||
CRC32 crc = new CRC32();
|
||||
|
||||
while (channel.read(buffer) > 0) {
|
||||
buffer.flip();
|
||||
crc.update(buffer);
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
return Long.toHexString(crc.getValue());
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,102 @@
|
||||
package nu.marginalia.sideload;
|
||||
|
||||
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Optional;
|
||||
import java.util.zip.CRC32;
|
||||
|
||||
/** Contains helper functions for pre-converting stackexchange style 7z
|
||||
* files to marginalia-digestible sqlite databases*/
|
||||
public class StackExchangeSideloadHelper {
|
||||
private static final Logger logger = LoggerFactory.getLogger(StackExchangeSideloadHelper.class);
|
||||
|
||||
/** Looks for stackexchange 7z files in the given path and converts them to sqlite databases.
|
||||
* The function is idempotent, so it is safe to call it multiple times on the same path
|
||||
* (it will not re-convert files that have already been successfully converted)
|
||||
* */
|
||||
public static void convertStackexchangeData(Path sourcePath) {
|
||||
if (Files.isDirectory(sourcePath)) {
|
||||
try (var contents = Files.list(sourcePath)) {
|
||||
contents.filter(Files::isRegularFile)
|
||||
.parallel()
|
||||
.forEach(StackExchangeSideloadHelper::convertSingleStackexchangeFile);
|
||||
} catch (IOException ex) {
|
||||
logger.warn("Failed to convert stackexchange 7z file to sqlite database", ex);
|
||||
}
|
||||
} else if (Files.isRegularFile(sourcePath)) {
|
||||
convertSingleStackexchangeFile(sourcePath);
|
||||
}
|
||||
}
|
||||
|
||||
private static void convertSingleStackexchangeFile(Path sourcePath) {
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
|
||||
if (fileName.endsWith(".db")) return;
|
||||
if (!fileName.endsWith(".7z")) return;
|
||||
|
||||
Optional<String> domain = getStackexchangeDomainFromFilename(fileName);
|
||||
if (domain.isEmpty())
|
||||
return;
|
||||
|
||||
try {
|
||||
Path destPath = getStackexchangeDbPath(sourcePath);
|
||||
if (Files.exists(destPath)) return;
|
||||
|
||||
Path tempFile = Files.createTempFile(destPath.getParent(), "processed", "db.tmp");
|
||||
try {
|
||||
logger.info("Converting stackexchange 7z file {} to sqlite database", sourcePath);
|
||||
StackExchangePostsDb.create(domain.get(), tempFile, sourcePath);
|
||||
logger.info("Finished converting stackexchange 7z file {} to sqlite database", sourcePath);
|
||||
Files.move(tempFile, destPath, StandardCopyOption.REPLACE_EXISTING);
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to convert stackexchange 7z file to sqlite database", e);
|
||||
Files.deleteIfExists(tempFile);
|
||||
Files.deleteIfExists(destPath);
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
logger.warn("Failed to convert stackexchange 7z file to sqlite database", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static Path getStackexchangeDbPath(Path sourcePath) throws IOException {
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
String hash = SideloadHelper.getCrc32FileHash(sourcePath);
|
||||
|
||||
return sourcePath.getParent().resolve(STR."\{fileName}.\{hash}.db");
|
||||
}
|
||||
|
||||
private static Optional<String> getStackexchangeDomainFromFilename(String fileName) {
|
||||
// We are only interested in .tld.7z files
|
||||
if (!fileName.endsWith(".7z") && fileName.length() > 7)
|
||||
return Optional.empty();
|
||||
|
||||
|
||||
// Stackoverflow is special, because it has one 7z file per site
|
||||
// (we only want Posts)
|
||||
|
||||
if (fileName.equals("stackoverflow-Posts.7z"))
|
||||
return Optional.of("stackoverflow.com");
|
||||
else if (fileName.startsWith("stackoverflow.com-")) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
// For stackexchange, we filter out the meta archives
|
||||
|
||||
// We are not interested in the meta files
|
||||
if (fileName.startsWith("meta."))
|
||||
return Optional.empty();
|
||||
if (fileName.contains(".meta."))
|
||||
return Optional.empty();
|
||||
|
||||
// Pattern is 'foobar.stackexchange.com.7z'
|
||||
return Optional.of(fileName.substring(0, fileName.length() - 3));
|
||||
}
|
||||
|
||||
}
|
@ -1,40 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'application'
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(21))
|
||||
}
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.tools.StackexchangeConverter'
|
||||
applicationName = 'stackexchange-converter'
|
||||
}
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:features-convert:stackexchange-xml')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.notnull
|
||||
|
||||
implementation libs.guice
|
||||
implementation libs.jsoup
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
|
||||
implementation libs.bundles.nlp
|
||||
implementation libs.commons.lang3
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
@ -1,24 +0,0 @@
|
||||
This tool converts from stackexchange's 7z-compressed XML
|
||||
format to a sqlite database that is digestible by the search engine.
|
||||
|
||||
See [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) for
|
||||
an explanation why this is necessary.
|
||||
|
||||
Stackexchange's data dumps can be downloaded from archive.org
|
||||
here: [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)
|
||||
|
||||
<b>Usage</b>
|
||||
|
||||
```shell
|
||||
$ stackexchange-converter domain-name input.7z output.db
|
||||
```
|
||||
|
||||
Stackexchange is relatively conservative about allowing
|
||||
new questions, so this is a job that doesn't run more than once.
|
||||
|
||||
<b>Note</b>: Reading and writing these db files is *absurdly* slow
|
||||
on a mechanical hard-drive.
|
||||
|
||||
## See Also
|
||||
|
||||
* [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml)
|
@ -1,31 +0,0 @@
|
||||
package nu.marginalia.tools;
|
||||
|
||||
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class StackexchangeConverter {
|
||||
public static void main(String[] args) {
|
||||
|
||||
if (args.length != 3) {
|
||||
System.err.println("Converts a stackexchange Posts 7z file to a Marginalia-digestible sqlite-db\n");
|
||||
System.err.println("Arguments: domain-name input-file.7z output-file.db");
|
||||
return;
|
||||
}
|
||||
|
||||
String domain = args[0];
|
||||
|
||||
Path inputFile = Path.of(args[1]);
|
||||
Path outputFile = Path.of(args[2]);
|
||||
|
||||
if (!Files.exists(inputFile))
|
||||
System.err.println("Input file " + inputFile + " does not exists");
|
||||
|
||||
System.out.println("Converting " + inputFile);
|
||||
|
||||
StackExchangePostsDb.create(domain, outputFile, inputFile);
|
||||
|
||||
System.out.println("... done!");
|
||||
}
|
||||
}
|
BIN
doc/images/convert_2.png
Normal file
BIN
doc/images/convert_2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 62 KiB |
BIN
doc/images/load_warc.png
Normal file
BIN
doc/images/load_warc.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 35 KiB |
BIN
doc/images/sideload_menu.png
Normal file
BIN
doc/images/sideload_menu.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 48 KiB |
BIN
doc/images/sideload_warc.png
Normal file
BIN
doc/images/sideload_warc.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 44 KiB |
@ -1,23 +1,121 @@
|
||||
# Sideloading How-To
|
||||
|
||||
(This document is a bit of a draft to get this down in writing
|
||||
while it's still fresh in my head.)
|
||||
|
||||
Some websites are much larger than others, this includes
|
||||
Wikipedia, Stack Overflow, and a few others. They are so
|
||||
large they are impractical to crawl in the traditional fashion,
|
||||
but luckily they make available data dumps that can be processed
|
||||
and loaded into the search engine through other means.
|
||||
|
||||
## Notes on Docker
|
||||
To this end, it's possible to sideload data into the search engine
|
||||
from other sources than the web crawler.
|
||||
|
||||
If you're running the system in docker, you'll need to provide the paths
|
||||
to the data in a way where it is available to the docker container.
|
||||
## Index Nodes
|
||||
|
||||
In practice, if you want to sideload data, you need to do it on
|
||||
a separate index node. Index nodes are separate instances of the
|
||||
index software. The default configuration is to have two index nodes,
|
||||
one for the web crawler, and one for sideloaded data.
|
||||
|
||||
The need for a separate node is due to incompatibilities in the work flows.
|
||||
|
||||
It is also a good idea in general, as very large domains can easily be so large that the entire time budget
|
||||
for the query is spent sifting through documents from that one domain, this is
|
||||
especially true with something like Wikipedia, which has a lot of documents at
|
||||
least tangentially related to any given topic.
|
||||
|
||||
This how-to assumes that you are operating on index-node 2.
|
||||
|
||||
## Notes on the upload directory
|
||||
|
||||
This is written assuming that the system is installed with the `install.sh`
|
||||
script, which deploys the system with docker-compose, and has a directory
|
||||
structure like
|
||||
|
||||
```
|
||||
...
|
||||
index-1/backup/
|
||||
index-1/index/
|
||||
index-1/storage/
|
||||
index-1/uploads/
|
||||
index-1/work/
|
||||
index-2/backup/
|
||||
index-2/index/
|
||||
index-2/storage/
|
||||
index-2/uploads/
|
||||
index-2/work/
|
||||
...
|
||||
```
|
||||
|
||||
We're going to be putting files in the **uploads** directories. If you have installed
|
||||
the system in some other way, or changed the configuration significantly, you need
|
||||
to adjust the paths accordingly.
|
||||
|
||||
## Sideloading
|
||||
|
||||
The sideloading actions are available through Actions menu in each node.
|
||||
|
||||
![Sideload menu](images/sideload_menu.png)
|
||||
|
||||
## Sideloading WARCs
|
||||
|
||||
WARC files are the standard format for web archives. They can be created e.g. with wget.
|
||||
The Marginalia software can read WARC files directly, and sideload them into the index,
|
||||
as long as each warc file contains only one domain.
|
||||
|
||||
Let's for example archive www.marginalia.nu (I own this domain, so feel free to try this at home)
|
||||
|
||||
```bash
|
||||
$ wget -r --warc-file=marginalia www.marginalia.nu
|
||||
```
|
||||
|
||||
**Note** If you intend to do this on other websites, you should probably add a `--wait` parameter to wget,
|
||||
e.g. `wget --wait=1 -r --warc-file=...` to avoid hammering the website with requests and getting blocked.
|
||||
|
||||
This will take a moment, and create a file called `marginalia.warc.gz`. We move it to the
|
||||
upload directory of the index node, and sideload it through the Actions menu.
|
||||
|
||||
```bash
|
||||
$ mkdir -p index-2/uploads/marginalia-warc
|
||||
$ mv marginalia.warc.gz index-2/uploads/marginalia-warc
|
||||
```
|
||||
|
||||
Go to the Actions menu, and select the "Sideload WARC" action. This will show a list of
|
||||
subdirectories in the Uploads directory. Select the directory containing the WARC file, and
|
||||
click "Sideload".
|
||||
|
||||
![Sideload WARC screenshot](images/sideload_warc.png)
|
||||
|
||||
This should take you to the node overview, where you can see the progress of the sideloading.
|
||||
It will take a moment, as the WARC file is being processed.
|
||||
|
||||
![Processing in progress](images/convert_2.png)
|
||||
|
||||
It will not be loaded automatically. This is to permit you to sideload multiple sources.
|
||||
|
||||
When you are ready to load it, go to the Actions menu, and select "Load Crawl Data".
|
||||
|
||||
![Load Crawl Data](images/load_warc.png)
|
||||
|
||||
Select all the sources you want to load, and click "Load". This will load the data into the
|
||||
index, and make it available for searching.
|
||||
|
||||
## Sideloading Wikipedia
|
||||
|
||||
Due to licensing incompatibilities with OpenZim's GPL-2 and AGPL, the workflow
|
||||
depends on using the conversion process from [https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/)
|
||||
to pre-digest the data.
|
||||
|
||||
Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu)
|
||||
and follow the instructions for downloading a ZIM file, and then run something like
|
||||
|
||||
```$./encyclopedia convert file.zim articles.db```
|
||||
|
||||
This db-file can be processed and loaded into the search engine through the
|
||||
Actions view.
|
||||
|
||||
FIXME: It will currently only point to en.wikipedia.org, this should be
|
||||
made configurable.
|
||||
|
||||
Either mount the data into the executor's container, or copy it into e.g.
|
||||
the data directory, which is mounted into the container as `/wmsa/data`.
|
||||
For a test deployment, a file placed in `run/data/foo` will be available
|
||||
in the container as `/wmsa/data/foo`.
|
||||
|
||||
## Sideloading a directory tree
|
||||
|
||||
@ -98,23 +196,6 @@ python-3.11.5/[...]
|
||||
This yaml-file can be processed and loaded into the search engine through the
|
||||
Actions view.
|
||||
|
||||
## Sideloading Wikipedia
|
||||
|
||||
For now, this workflow depends on using the conversion process from
|
||||
[https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/)
|
||||
to pre-digest the data. This is because it uses OpenZIM which has a
|
||||
license that is incompatible with this project.
|
||||
|
||||
Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu)
|
||||
and follow the instructions for downloading a ZIM file, and then run something like
|
||||
|
||||
```$./encyclopedia convert file.zim articles.db```
|
||||
|
||||
This db-file can be processed and loaded into the search engine through the
|
||||
Actions view.
|
||||
|
||||
FIXME: It will currently only point to en.wikipedia.org, this should be
|
||||
made configurable.
|
||||
|
||||
## Sideloading Stack Overflow/Stackexchange
|
||||
|
||||
|
@ -85,11 +85,9 @@ include 'code:tools:term-frequency-extractor'
|
||||
include 'code:tools:experiment-runner'
|
||||
include 'code:tools:screenshot-capture-tool'
|
||||
include 'code:tools:load-test'
|
||||
include 'code:tools:stackexchange-converter'
|
||||
include 'code:tools:crawl-data-unfcker'
|
||||
|
||||
include 'third-party:porterstemmer'
|
||||
include 'third-party:xz'
|
||||
include 'third-party:symspell'
|
||||
include 'third-party:rdrpostagger'
|
||||
include 'third-party:openzim'
|
||||
@ -164,7 +162,7 @@ dependencyResolutionManagement {
|
||||
library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
|
||||
library('commons.net', 'commons-net','commons-net').version('3.9.0')
|
||||
library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
|
||||
library('commons.compress','org.apache.commons','commons-compress').version('1.21')
|
||||
library('commons.compress','org.apache.commons','commons-compress').version('1.25.0')
|
||||
library('commons.io','commons-io','commons-io').version('2.11.0')
|
||||
library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0')
|
||||
|
||||
@ -185,6 +183,7 @@ dependencyResolutionManagement {
|
||||
|
||||
library('zstd','com.github.luben','zstd-jni').version('1.5.2-2')
|
||||
library('lz4','org.lz4','lz4-java').version('1.8.0')
|
||||
library('xz','org.tukaani','xz').version('1.9')
|
||||
|
||||
library('flyway.core','org.flywaydb','flyway-core').version('10.4.1')
|
||||
library('flyway.mysql','org.flywaydb','flyway-mysql').version('10.4.1')
|
||||
|
@ -17,7 +17,6 @@ dependencies {
|
||||
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation project(':third-party:xz')
|
||||
implementation project(':third-party:openzim')
|
||||
}
|
||||
|
||||
|
2
third-party/openzim/build.gradle
vendored
2
third-party/openzim/build.gradle
vendored
@ -16,7 +16,7 @@ dependencies {
|
||||
implementation libs.databind
|
||||
implementation libs.bundles.gson
|
||||
|
||||
implementation project(':third-party:xz')
|
||||
implementation libs.xz
|
||||
}
|
||||
|
||||
test {
|
||||
|
16
third-party/xz/build.gradle
vendored
16
third-party/xz/build.gradle
vendored
@ -1,16 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(21))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
9
third-party/xz/readme.md
vendored
9
third-party/xz/readme.md
vendored
@ -1,9 +0,0 @@
|
||||
# XZ
|
||||
|
||||
[XZ for Java](https://tukaani.org/xz/) - Public Domain
|
||||
|
||||
"XZ Utils is free general-purpose data compression software with a high compression ratio.
|
||||
XZ Utils were written for POSIX-like systems, but also work on some not-so-POSIX systems.
|
||||
XZ Utils are the successor to LZMA Utils."
|
||||
|
||||
Needed for [openzim](../openzim) to deal with modern zim files.
|
@ -1,212 +0,0 @@
|
||||
/*
|
||||
* BlockInputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.tukaani.xz.common.DecoderUtil;
|
||||
import org.tukaani.xz.check.Check;
|
||||
|
||||
// Reads one XZ Block: parses and validates the Block Header, builds the
// filter (decompression) chain, then streams decompressed data while
// verifying sizes and the integrity check against the header fields.
class BlockInputStream extends InputStream {
    private final InputStream in;
    private final DataInputStream inData;
    private final CountingInputStream inCounted; // counts Compressed Data bytes
    private InputStream filterChain;
    private final Check check; // integrity check accumulated over output bytes

    // -1 means the corresponding optional field was absent from the header.
    private long uncompressedSizeInHeader = -1;
    private long compressedSizeInHeader = -1;
    private long compressedSizeLimit;
    private final int headerSize;
    private long uncompressedSize = 0;

    // Parses the Block Header from `in`. Throws IndexIndicatorException when
    // the first byte is 0x00, i.e. the Index field starts instead of a Block.
    // memoryLimit < 0 disables the decoder memory-usage check.
    public BlockInputStream(InputStream in, Check check, int memoryLimit)
            throws IOException, IndexIndicatorException {
        this.in = in;
        this.check = check;
        inData = new DataInputStream(in);

        byte[] buf = new byte[DecoderUtil.BLOCK_HEADER_SIZE_MAX];

        // Block Header Size or Index Indicator
        inData.readFully(buf, 0, 1);

        // See if this begins the Index field.
        if (buf[0] == 0x00)
            throw new IndexIndicatorException();

        // Read the rest of the Block Header.
        headerSize = 4 * (buf[0] + 1);
        inData.readFully(buf, 1, headerSize - 1);

        // Validate the CRC32 (stored in the last 4 bytes of the header).
        if (!DecoderUtil.isCRC32Valid(buf, 0, headerSize - 4, headerSize - 4))
            throw new CorruptedInputException("XZ Block Header is corrupt");

        // Check for reserved bits in Block Flags.
        if ((buf[1] & 0x3C) != 0)
            throw new UnsupportedOptionsException(
                    "Unsupported options in XZ Block Header");

        // Memory for the Filter Flags field
        int filterCount = (buf[1] & 0x03) + 1;
        long[] filterIDs = new long[filterCount];
        byte[][] filterProps = new byte[filterCount][];

        // Use a stream to parse the fields after the Block Flags field.
        // Exclude the CRC32 field at the end.
        ByteArrayInputStream bufStream = new ByteArrayInputStream(
                buf, 2, headerSize - 6);

        try {
            // Set the maximum valid compressed size. This is overriden
            // by the value from the Compressed Size field if it is present.
            compressedSizeLimit = (DecoderUtil.VLI_MAX & ~3)
                                  - headerSize - check.getSize();

            // Decode and validate Compressed Size if the relevant flag
            // is set in Block Flags.
            if ((buf[1] & 0x40) != 0x00) {
                compressedSizeInHeader = DecoderUtil.decodeVLI(bufStream);

                if (compressedSizeInHeader == 0
                        || compressedSizeInHeader > compressedSizeLimit)
                    throw new CorruptedInputException();

                compressedSizeLimit = compressedSizeInHeader;
            }

            // Decode Uncompressed Size if the relevant flag is set
            // in Block Flags.
            if ((buf[1] & 0x80) != 0x00)
                uncompressedSizeInHeader = DecoderUtil.decodeVLI(bufStream);

            // Decode Filter Flags: for each filter, a VLI Filter ID
            // followed by a VLI-sized properties blob.
            for (int i = 0; i < filterCount; ++i) {
                filterIDs[i] = DecoderUtil.decodeVLI(bufStream);

                long filterPropsSize = DecoderUtil.decodeVLI(bufStream);
                if (filterPropsSize > bufStream.available())
                    throw new CorruptedInputException();

                filterProps[i] = new byte[(int)filterPropsSize];
                bufStream.read(filterProps[i]);
            }

        } catch (IOException e) {
            // A read error inside the in-memory header buffer means the
            // header itself is malformed, not that I/O failed.
            throw new CorruptedInputException("XZ Block Header is corrupt");
        }

        // Check that the remaining bytes are zero (Header Padding).
        for (int i = bufStream.available(); i > 0; --i)
            if (bufStream.read() != 0x00)
                throw new UnsupportedOptionsException(
                        "Unsupported options in XZ Block Header");

        // Check if the Filter IDs are supported, decode
        // the Filter Properties, and check that they are
        // supported by this decoder implementation.
        FilterDecoder[] filters = new FilterDecoder[filterIDs.length];

        for (int i = 0; i < filters.length; ++i) {
            if (filterIDs[i] == LZMA2Coder.FILTER_ID)
                filters[i] = new LZMA2Decoder(filterProps[i]);

            else if (filterIDs[i] == DeltaCoder.FILTER_ID)
                filters[i] = new DeltaDecoder(filterProps[i]);

            else
                throw new UnsupportedOptionsException(
                        "Unknown Filter ID " + filterIDs[i]);
        }

        RawCoder.validate(filters);

        // Check the memory usage limit.
        if (memoryLimit >= 0) {
            int memoryNeeded = 0;
            for (int i = 0; i < filters.length; ++i)
                memoryNeeded += filters[i].getMemoryUsage();

            if (memoryNeeded > memoryLimit)
                throw new MemoryLimitException(memoryNeeded, memoryLimit);
        }

        // Use an input size counter to calculate
        // the size of the Compressed Data field.
        inCounted = new CountingInputStream(in);

        // Initialize the filter chain, last filter first so the first
        // filter in the header ends up outermost.
        filterChain = inCounted;
        for (int i = filters.length - 1; i >= 0; --i)
            filterChain = filters[i].getInputStream(filterChain);
    }

    // Reads one decompressed byte, or -1 at end of the Block.
    public int read() throws IOException {
        byte[] buf = new byte[1];
        return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
    }

    // Reads decompressed bytes, updating the integrity check and validating
    // the running sizes against the header's declared sizes. On end of
    // stream, also consumes Block Padding and verifies the stored check.
    public int read(byte[] buf, int off, int len) throws IOException {
        int ret = filterChain.read(buf, off, len);
        long compressedSize = inCounted.getSize();

        if (ret > 0) {
            check.update(buf, off, ret);
            uncompressedSize += ret;

            // Catch invalid values.
            if (compressedSize < 0
                    || compressedSize > compressedSizeLimit
                    || uncompressedSize < 0
                    || (uncompressedSizeInHeader != -1
                        && uncompressedSize > uncompressedSizeInHeader))
                throw new CorruptedInputException();

        } else if (ret == -1) {
            // Validate Compressed Size and Uncompressed Size if they were
            // present in Block Header.
            if ((compressedSizeInHeader != -1
                        && compressedSizeInHeader != compressedSize)
                    || (uncompressedSizeInHeader != -1
                        && uncompressedSizeInHeader != uncompressedSize))
                throw new CorruptedInputException();

            // Block Padding bytes must be zeros (pad to a multiple of 4).
            for (long i = compressedSize; (i & 3) != 0; ++i)
                if (inData.readUnsignedByte() != 0x00)
                    throw new CorruptedInputException();

            // Validate the integrity check.
            byte[] storedCheck = new byte[check.getSize()];
            inData.readFully(storedCheck);
            if (!Arrays.equals(check.finish(), storedCheck))
                throw new CorruptedInputException("Integrity ("
                        + check.getName() + ") check does not match");
        }

        return ret;
    }

    public int available() throws IOException {
        return filterChain.available();
    }

    // Size of this Block without Block Padding:
    // header + compressed data + check.
    public long getUnpaddedSize() {
        return headerSize + inCounted.getSize() + check.getSize();
    }

    // Number of decompressed bytes produced so far.
    public long getUncompressedSize() {
        return uncompressedSize;
    }
}
|
@ -1,128 +0,0 @@
|
||||
/*
|
||||
* BlockOutputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import org.tukaani.xz.common.EncoderUtil;
|
||||
import org.tukaani.xz.check.Check;
|
||||
|
||||
// Writes one XZ Block: encodes and emits the Block Header up front, then
// streams data through the filter (compression) chain, and on finish()
// emits Block Padding and the integrity check.
class BlockOutputStream extends FinishableOutputStream {
    private final OutputStream out;
    private final CountingOutputStream outCounted; // counts Compressed Data bytes
    private FinishableOutputStream filterChain;
    private final Check check; // integrity check accumulated over input bytes

    private final int headerSize;
    private final long compressedSizeLimit;
    private long uncompressedSize = 0;

    // Builds the filter chain from `filters` (applied in order) and writes
    // the complete Block Header to `out` immediately.
    public BlockOutputStream(OutputStream out, FilterEncoder[] filters,
                             Check check) throws IOException {
        this.out = out;
        this.check = check;

        // Initialize the filter chain.
        outCounted = new CountingOutputStream(out);
        filterChain = outCounted;
        for (int i = 0; i < filters.length; ++i)
            filterChain = filters[i].getOutputStream(filterChain);

        // Prepare to encode the Block Header field.
        ByteArrayOutputStream bufStream = new ByteArrayOutputStream();

        // Write a dummy Block Header Size field. The real value is written
        // once everything else except CRC32 has been written.
        bufStream.write(0x00);

        // Write Block Flags. Storing Compressed Size or Uncompressed Size
        // isn't supported for now.
        bufStream.write(filters.length - 1);

        // List of Filter Flags
        for (int i = 0; i < filters.length; ++i) {
            EncoderUtil.encodeVLI(bufStream, filters[i].getFilterID());
            byte[] filterProps = filters[i].getFilterProps();
            EncoderUtil.encodeVLI(bufStream, filterProps.length);
            bufStream.write(filterProps);
        }

        // Header Padding (pad to a multiple of 4 bytes)
        while ((bufStream.size() & 3) != 0)
            bufStream.write(0x00);

        byte[] buf = bufStream.toByteArray();

        // Total size of the Block Header: Take the size of the CRC32 field
        // into account.
        headerSize = buf.length + 4;

        // This is just a sanity check.
        if (headerSize > EncoderUtil.BLOCK_HEADER_SIZE_MAX)
            throw new UnsupportedOptionsException();

        // Block Header Size (encoded as realSize / 4 - 1; buf already
        // excludes the 4-byte CRC32, hence buf.length / 4)
        buf[0] = (byte)(buf.length / 4);

        // Write the Block Header field to the output stream.
        out.write(buf);
        EncoderUtil.writeCRC32(out, buf);

        // Calculate the maximum allowed size of the Compressed Data field.
        // It is hard to exceed it so this is mostly to be pedantic.
        compressedSizeLimit = (EncoderUtil.VLI_MAX & ~3)
                              - headerSize - check.getSize();
    }

    public void write(int b) throws IOException {
        byte[] buf = new byte[1];
        buf[0] = (byte)b;
        write(buf, 0, 1);
    }

    // Feeds data through the filter chain and updates the integrity check
    // over the uncompressed (pre-filter) bytes.
    public void write(byte[] buf, int off, int len) throws IOException {
        filterChain.write(buf, off, len);
        check.update(buf, off, len);
        uncompressedSize += len;
        validate();
    }

    // Finishes the Block: flushes the filter chain, writes Block Padding,
    // and appends the integrity check bytes.
    public void finish() throws IOException {
        // Finish the Compressed Data field.
        filterChain.finish();
        validate();

        // Block Padding
        for (long i = outCounted.getSize(); (i & 3) != 0; ++i)
            out.write(0x00);

        // Check
        out.write(check.finish());
    }

    // Sanity-checks the running sizes against format limits.
    private void validate() throws IOException {
        long compressedSize = outCounted.getSize();

        // It is very hard to trigger this exception.
        // This is just to be pedantic.
        if (compressedSize < 0 || compressedSize > compressedSizeLimit
                || uncompressedSize < 0)
            throw new XZIOException("XZ Stream has grown too big");
    }

    // Size of this Block without Block Padding:
    // header + compressed data + check.
    public long getUnpaddedSize() {
        return headerSize + outCounted.getSize() + check.getSize();
    }

    // Number of uncompressed bytes written so far.
    public long getUncompressedSize() {
        return uncompressedSize;
    }
}
|
@ -1,37 +0,0 @@
|
||||
/*
|
||||
* CorruptedInputException
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
/**
|
||||
* Thrown when the compressed input data is corrupt.
|
||||
* However, it is possible that some or all of the data
|
||||
* already read from the input stream was corrupt too.
|
||||
*/
|
||||
public class CorruptedInputException extends XZIOException {
|
||||
private static final long serialVersionUID = 3L;
|
||||
|
||||
/**
|
||||
* Creates a new CorruptedInputException with
|
||||
* the default error detail message.
|
||||
*/
|
||||
public CorruptedInputException() {
|
||||
super("Compressed data is corrupt");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new CorruptedInputException with
|
||||
* the specified error detail message.
|
||||
*
|
||||
* @param s error detail message
|
||||
*/
|
||||
public CorruptedInputException(String s) {
|
||||
super(s);
|
||||
}
|
||||
}
|
@ -1,42 +0,0 @@
|
||||
/*
|
||||
* CountingInputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.FilterInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
// Pass-through input stream that keeps a running count of the bytes
// actually delivered from the wrapped stream.
class CountingInputStream extends FilterInputStream {
    // Total bytes read so far. NOTE(review): counting stops if this ever
    // becomes negative (overflow sentinel, mirrors upstream behavior).
    private long size = 0;

    public CountingInputStream(InputStream in) {
        super(in);
    }

    @Override
    public int read() throws IOException {
        int b = in.read();
        if (b != -1 && size >= 0)
            ++size;

        return b;
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        int n = in.read(b, off, len);
        if (n > 0 && size >= 0)
            size += n;

        return n;
    }

    /** Number of bytes read from the underlying stream so far. */
    public long getSize() {
        return size;
    }
}
|
@ -1,46 +0,0 @@
|
||||
/*
|
||||
* CountingOutputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
class CountingOutputStream extends FinishableOutputStream {
|
||||
private final OutputStream out;
|
||||
private long size = 0;
|
||||
|
||||
public CountingOutputStream(OutputStream out) {
|
||||
this.out = out;
|
||||
}
|
||||
|
||||
public void write(int b) throws IOException {
|
||||
out.write(b);
|
||||
if (size >= 0)
|
||||
++size;
|
||||
}
|
||||
|
||||
public void write(byte[] b, int off, int len) throws IOException {
|
||||
out.write(b, off, len);
|
||||
if (size >= 0)
|
||||
size += len;
|
||||
}
|
||||
|
||||
public void flush() throws IOException {
|
||||
out.flush();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
out.close();
|
||||
}
|
||||
|
||||
public long getSize() {
|
||||
return size;
|
||||
}
|
||||
}
|
@ -1,26 +0,0 @@
|
||||
/*
|
||||
* DeltaCoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
abstract class DeltaCoder implements FilterCoder {
|
||||
public static final long FILTER_ID = 0x03;
|
||||
|
||||
public boolean changesSize() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean nonLastOK() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean lastOK() {
|
||||
return false;
|
||||
}
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
/*
|
||||
* DeltaDecoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
class DeltaDecoder extends DeltaCoder implements FilterDecoder {
|
||||
private final int distance;
|
||||
|
||||
DeltaDecoder(byte[] props) throws UnsupportedOptionsException {
|
||||
if (props.length != 1)
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported Delta filter properties");
|
||||
|
||||
distance = (props[0] & 0xFF) + 1;
|
||||
}
|
||||
|
||||
public int getMemoryUsage() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public InputStream getInputStream(InputStream in) {
|
||||
return new DeltaInputStream(in, distance);
|
||||
}
|
||||
}
|
@ -1,105 +0,0 @@
|
||||
/*
|
||||
* DeltaInputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import org.tukaani.xz.delta.DeltaDecoder;
|
||||
|
||||
/**
|
||||
* Decodes Delta-filtered data.
|
||||
* <p>
|
||||
* The delta filter doesn't change the size of the data and thus it
|
||||
* cannot have an end-of-payload marker. It will simply decode until
|
||||
* its input stream indicates end of input.
|
||||
*/
|
||||
public class DeltaInputStream extends InputStream {
|
||||
/**
|
||||
* Smallest supported delta calculation distance.
|
||||
*/
|
||||
public static final int DISTANCE_MIN = 1;
|
||||
|
||||
/**
|
||||
* Largest supported delta calculation distance.
|
||||
*/
|
||||
public static final int DISTANCE_MAX = 256;
|
||||
|
||||
private final InputStream in;
|
||||
private final DeltaDecoder delta;
|
||||
|
||||
/**
|
||||
* Creates a new Delta decoder with the given delta calculation distance.
|
||||
*
|
||||
* @param in input stream from which Delta filtered data
|
||||
* is read
|
||||
*
|
||||
* @param distance delta calculation distance, must be in the
|
||||
* range [<code>DISTANCE_MIN</code>,
|
||||
* <code>DISTANCE_MAX</code>]
|
||||
*/
|
||||
public DeltaInputStream(InputStream in, int distance) {
|
||||
this.in = in;
|
||||
this.delta = new DeltaDecoder(distance);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the next byte from this input stream.
|
||||
*
|
||||
* @return the next decoded byte, or <code>-1</code> to indicate
|
||||
* the end of input on the input stream <code>in</code>
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public int read() throws IOException {
|
||||
byte[] buf = new byte[1];
|
||||
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode into an array of bytes.
|
||||
* <p>
|
||||
* This calls <code>in.read(buf, off, len)</code> and defilters the
|
||||
* returned data.
|
||||
*
|
||||
* @param buf target buffer for decoded data
|
||||
* @param off start offset in <code>buf</code>
|
||||
* @param len maximum number of bytes to read
|
||||
*
|
||||
* @return number of bytes read, or <code>-1</code> to indicate
|
||||
* the end of the input stream <code>in</code>
|
||||
*
|
||||
* @throws IOException may be thrown by underlaying input
|
||||
* stream <code>in</code>
|
||||
*/
|
||||
public int read(byte[] buf, int off, int len) throws IOException {
|
||||
int size = in.read(buf, off, len);
|
||||
if (size == -1)
|
||||
return -1;
|
||||
|
||||
delta.decode(buf, off, size);
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls <code>in.available()</code>.
|
||||
*
|
||||
* @return the value returned by <code>in.available()</code>
|
||||
*/
|
||||
public int available() throws IOException {
|
||||
return in.available();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls <code>in.close()</code>.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
in.close();
|
||||
}
|
||||
}
|
@ -1,16 +0,0 @@
|
||||
/*
|
||||
* FilterCoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
// Placement and sizing properties common to the encoder and decoder
// sides of an XZ filter.
interface FilterCoder {
    // True if the filter may change the size of the data
    // (LZMA2 does; Delta does not).
    boolean changesSize();

    // True if the filter may appear as a non-last filter in the chain.
    boolean nonLastOK();

    // True if the filter may appear as the last filter in the chain.
    boolean lastOK();
}
|
@ -1,17 +0,0 @@
|
||||
/*
|
||||
* FilterDecoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
// Decoder side of an XZ filter: reports its memory needs and wraps an
// input stream with the filter's decoding stream.
interface FilterDecoder extends FilterCoder {
    // Approximate decoder memory requirement in KiB.
    int getMemoryUsage();

    // Wraps `in` so that reads return defiltered data.
    InputStream getInputStream(InputStream in);
}
|
@ -1,16 +0,0 @@
|
||||
/*
|
||||
* FilterEncoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
// Encoder side of an XZ filter: supplies the Filter Flags fields
// (ID + properties) for the Block Header and wraps an output stream
// with the filter's encoding stream.
interface FilterEncoder extends FilterCoder {
    // Filter ID as stored in the Block Header.
    long getFilterID();

    // Encoded filter properties as stored in the Block Header.
    byte[] getFilterProps();

    // Wraps `out` so that writes are filtered before reaching it.
    FinishableOutputStream getOutputStream(FinishableOutputStream out);
}
|
@ -1,28 +0,0 @@
|
||||
/*
|
||||
* FilterOptions
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
// Base class for per-filter option objects; produces matching encoder
// and decoder streams and reports their memory requirements.
public abstract class FilterOptions implements Cloneable {
    // Approximate encoder memory requirement in KiB.
    public abstract int getEncoderMemoryUsage();
    public abstract FinishableOutputStream getOutputStream(
            FinishableOutputStream out);

    // Approximate decoder memory requirement in KiB.
    public abstract int getDecoderMemoryUsage();
    public abstract InputStream getInputStream(InputStream in)
            ;

    abstract FilterEncoder getFilterEncoder();

    // Package-private constructor: only classes in this package
    // may subclass FilterOptions.
    FilterOptions() {}
}
|
@ -1,31 +0,0 @@
|
||||
/*
|
||||
* FinishableOutputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * Output stream that can be "finished" — completed — without closing
 * the underlying stream.
 */
public abstract class FinishableOutputStream extends OutputStream {
    /**
     * Finishes the stream without closing the underlying stream.
     * No further data may be written after finishing.
     * <p>
     * The default implementation does nothing. Subclasses that need
     * real finishing work — compressors, for example — override it.
     *
     * @throws IOException
     */
    public void finish() throws IOException {}
}
|
@ -1,14 +0,0 @@
|
||||
/*
|
||||
* IndexIndicatorException
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
// Internal control-flow signal: the byte that would have started a
// Block Header was 0x00, which instead marks the start of the Index
// field (see BlockInputStream's constructor).
class IndexIndicatorException extends Exception {
    private static final long serialVersionUID = 1L;
}
|
@ -1,26 +0,0 @@
|
||||
/*
|
||||
* LZMA2Coder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
abstract class LZMA2Coder implements FilterCoder {
|
||||
public static final long FILTER_ID = 0x21;
|
||||
|
||||
public boolean changesSize() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean nonLastOK() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean lastOK() {
|
||||
return true;
|
||||
}
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* LZMA2Decoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
class LZMA2Decoder extends LZMA2Coder implements FilterDecoder {
|
||||
private int dictSize;
|
||||
|
||||
LZMA2Decoder(byte[] props) throws UnsupportedOptionsException {
|
||||
// Up to 1.5 GiB dictionary is supported. The bigger ones
|
||||
// are too big for int.
|
||||
if (props.length != 1 || (props[0] & 0xFF) > 37)
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported LZMA2 properties");
|
||||
|
||||
dictSize = 2 | (props[0] & 1);
|
||||
dictSize <<= (props[0] >>> 1) + 11;
|
||||
}
|
||||
|
||||
public int getMemoryUsage() {
|
||||
return LZMA2InputStream.getMemoryUsage(dictSize);
|
||||
}
|
||||
|
||||
public InputStream getInputStream(InputStream in) {
|
||||
return new LZMA2InputStream(in, dictSize);
|
||||
}
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* LZMA2Encoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
class LZMA2Encoder extends LZMA2Coder implements FilterEncoder {
|
||||
private final LZMA2Options options;
|
||||
private final byte[] props = new byte[1];
|
||||
|
||||
LZMA2Encoder(LZMA2Options options) {
|
||||
// Make a private copy so that the caller is free to change its copy.
|
||||
this.options = (LZMA2Options)options.clone();
|
||||
|
||||
// TODO: Props!!!
|
||||
|
||||
}
|
||||
|
||||
public long getFilterID() {
|
||||
return FILTER_ID;
|
||||
}
|
||||
|
||||
public byte[] getFilterProps() {
|
||||
return props;
|
||||
}
|
||||
|
||||
public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
|
||||
return options.getOutputStream(out);
|
||||
}
|
||||
}
|
@ -1,328 +0,0 @@
|
||||
/*
|
||||
* LZMA2InputStream
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import org.tukaani.xz.lz.LZDecoder;
|
||||
import org.tukaani.xz.rangecoder.RangeDecoder;
|
||||
import org.tukaani.xz.lzma.LZMADecoder;
|
||||
|
||||
/**
|
||||
* Decompresses a raw LZMA2 stream.
|
||||
*/
|
||||
public class LZMA2InputStream extends InputStream {
|
||||
/**
|
||||
* Smallest valid LZMA2 dictionary size.
|
||||
* <p>
|
||||
* Very tiny dictionaries would be a performance problem, so
|
||||
* the minimum is 4 KiB.
|
||||
*/
|
||||
public static final int DICT_SIZE_MIN = 4096;
|
||||
|
||||
/**
|
||||
* Largest dictionary size supported by this implementation.
|
||||
* <p>
|
||||
* The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB.
|
||||
* This implementation supports only 16 bytes less than 2 GiB for raw
|
||||
* LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This
|
||||
* limitation is due to Java using signed 32-bit integers for array
|
||||
* indexing. The limitation shouldn't matter much in practice since so
|
||||
* huge dictionaries are not normally used.
|
||||
*/
|
||||
public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;
|
||||
|
||||
private static final int COMPRESSED_SIZE_MAX = 1 << 16;
|
||||
|
||||
private final DataInputStream in;
|
||||
|
||||
private final LZDecoder lz;
|
||||
private final RangeDecoder rc = new RangeDecoder(COMPRESSED_SIZE_MAX);
|
||||
private LZMADecoder lzma;
|
||||
|
||||
private int uncompressedSize = 0;
|
||||
private boolean isLZMAChunk;
|
||||
|
||||
private boolean needDictReset = true;
|
||||
private boolean needProps = true;
|
||||
private boolean endReached = false;
|
||||
|
||||
private IOException exception = null;
|
||||
|
||||
/**
|
||||
* Gets approximate decompressor memory requirements as kibibytes for
|
||||
* the given dictionary size.
|
||||
*
|
||||
* @param dictSize LZMA2 dictionary size as bytes, must be
|
||||
* in the range [<code>DICT_SIZE_MIN</code>,
|
||||
* <code>DICT_SIZE_MAX</code>]
|
||||
*
|
||||
* @return approximate memory requirements as kibibytes (KiB)
|
||||
*/
|
||||
public static int getMemoryUsage(int dictSize) {
|
||||
// The base state is aroudn 30-40 KiB (probabilities etc.),
|
||||
// range decoder needs COMPRESSED_SIZE_MAX bytes for buffering,
|
||||
// and LZ decoder needs a dictionary buffer.
|
||||
return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024;
|
||||
}
|
||||
|
||||
private static int getDictSize(int dictSize) {
|
||||
if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX)
|
||||
throw new IllegalArgumentException(
|
||||
"Unsupported dictionary size " + dictSize);
|
||||
|
||||
// Round dictionary size upward to a multiple of 16. This way LZMA
|
||||
// can use LZDecoder.getPos() for calculating LZMA's posMask.
|
||||
// Note that this check is needed only for raw LZMA2 streams; it is
|
||||
// redundant with .xz.
|
||||
return (dictSize + 15) & ~15;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new input stream that decompresses raw LZMA2 data
|
||||
* from <code>in</code>.
|
||||
* <p>
|
||||
* The caller needs to know the dictionary size used when compressing;
|
||||
* the dictionary size isn't stored as part of a raw LZMA2 stream.
|
||||
* <p>
|
||||
* Specifying a too small dictionary size will prevent decompressing
|
||||
* the stream. Specifying a too big dictionary is waste of memory but
|
||||
* decompression will work.
|
||||
* <p>
|
||||
* There is no need to specify a dictionary bigger than
|
||||
* the uncompressed size of the data even if a bigger dictionary
|
||||
* was used when compressing. If you know the uncompressed size
|
||||
* of the data, this might allow saving some memory.
|
||||
*
|
||||
* @param in input stream from which LZMA2-compressed
|
||||
* data is read
|
||||
*
|
||||
* @param dictSize LZMA2 dictionary size as bytes, must be
|
||||
* in the range [<code>DICT_SIZE_MIN</code>,
|
||||
* <code>DICT_SIZE_MAX</code>]
|
||||
*/
|
||||
public LZMA2InputStream(InputStream in, int dictSize) {
|
||||
this.in = new DataInputStream(in);
|
||||
this.lz = new LZDecoder(getDictSize(dictSize), null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new LZMA2 decompressor using a preset dictionary.
|
||||
* <p>
|
||||
* This is like <code>LZMAInputStream()</code> except that the
|
||||
* dictionary may be initialized using a preset dictionary.
|
||||
* If a preset dictionary was used when compressing the data, the
|
||||
* same preset dictionary must be provided when decompressing.
|
||||
*
|
||||
* @param in input stream from which LZMA2-compressed
|
||||
* data is read
|
||||
*
|
||||
* @param dictSize LZMA2 dictionary size as bytes, must be
|
||||
* in the range [<code>DICT_SIZE_MIN</code>,
|
||||
* <code>DICT_SIZE_MAX</code>]
|
||||
*
|
||||
* @param presetDict preset dictionary or <code>null</code>
|
||||
* to use no preset dictionary
|
||||
*/
|
||||
public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) {
|
||||
this.in = new DataInputStream(in);
|
||||
this.lz = new LZDecoder(getDictSize(dictSize), presetDict);
|
||||
|
||||
if (presetDict.length > 0)
|
||||
needDictReset = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompresses the next byte from this input stream.
|
||||
* <p>
|
||||
* Reading lots of data with <code>read()</code> from this input stream
|
||||
* may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
|
||||
* if you need to read lots of data one byte at a time.
|
||||
*
|
||||
* @return the next decompressed byte, or <code>-1</code>
|
||||
* to indicate the end of the compressed stream
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
*
|
||||
* @throws EOFException
|
||||
* compressed input is truncated or corrupt
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public int read() throws IOException {
|
||||
byte[] buf = new byte[1];
|
||||
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompresses into an array of bytes.
|
||||
* <p>
|
||||
* If <code>len</code> is zero, no bytes are read and <code>0</code>
|
||||
* is returned. Otherwise this will block until <code>len</code>
|
||||
* bytes have been decompressed, the end of LZMA2 stream is reached,
|
||||
* or an exception is thrown.
|
||||
*
|
||||
* @param buf target buffer for uncompressed data
|
||||
* @param off start offset in <code>buf</code>
|
||||
* @param len maximum number of uncompressed bytes to read
|
||||
*
|
||||
* @return number of bytes read, or <code>-1</code> to indicate
|
||||
* the end of the compressed stream
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
*
|
||||
* @throws EOFException
|
||||
* compressed input is truncated or corrupt
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public int read(byte[] buf, int off, int len) throws IOException {
|
||||
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
|
||||
throw new IllegalArgumentException();
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
if (exception != null)
|
||||
throw exception;
|
||||
|
||||
if (endReached)
|
||||
return -1;
|
||||
|
||||
try {
|
||||
int size = 0;
|
||||
|
||||
while (len > 0) {
|
||||
if (uncompressedSize == 0) {
|
||||
decodeChunkHeader();
|
||||
if (endReached)
|
||||
return size == 0 ? -1 : size;
|
||||
}
|
||||
|
||||
int copySizeMax = Math.min(uncompressedSize, len);
|
||||
|
||||
if (!isLZMAChunk) {
|
||||
lz.copyUncompressed(in, copySizeMax);
|
||||
} else {
|
||||
lz.setLimit(copySizeMax);
|
||||
lzma.decode();
|
||||
}
|
||||
|
||||
int copiedSize = lz.flush(buf, off);
|
||||
off += copiedSize;
|
||||
len -= copiedSize;
|
||||
size += copiedSize;
|
||||
uncompressedSize -= copiedSize;
|
||||
|
||||
if (uncompressedSize == 0)
|
||||
if (!rc.isFinished() || lz.hasPending())
|
||||
throw new CorruptedInputException();
|
||||
}
|
||||
|
||||
return size;
|
||||
|
||||
} catch (IOException e) {
|
||||
exception = e;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private void decodeChunkHeader() throws IOException {
|
||||
int control = in.readUnsignedByte();
|
||||
|
||||
if (control == 0x00) {
|
||||
endReached = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (control >= 0xE0 || control == 0x01) {
|
||||
needProps = true;
|
||||
needDictReset = false;
|
||||
lz.reset();
|
||||
} else if (needDictReset) {
|
||||
throw new CorruptedInputException();
|
||||
}
|
||||
|
||||
if (control >= 0x80) {
|
||||
isLZMAChunk = true;
|
||||
|
||||
uncompressedSize = (control & 0x1F) << 16;
|
||||
uncompressedSize += in.readUnsignedShort() + 1;
|
||||
|
||||
int compressedSize = in.readUnsignedShort() + 1;
|
||||
|
||||
if (control >= 0xC0) {
|
||||
needProps = false;
|
||||
decodeProps();
|
||||
|
||||
} else if (needProps) {
|
||||
throw new CorruptedInputException();
|
||||
|
||||
} else if (control >= 0xA0) {
|
||||
lzma.reset();
|
||||
}
|
||||
|
||||
rc.prepareInputBuffer(in, compressedSize);
|
||||
|
||||
} else if (control > 0x02) {
|
||||
throw new CorruptedInputException();
|
||||
|
||||
} else {
|
||||
isLZMAChunk = false;
|
||||
uncompressedSize = in.readUnsignedShort() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
private void decodeProps() throws IOException {
|
||||
int props = in.readUnsignedByte();
|
||||
|
||||
if (props > (4 * 5 + 4) * 9 + 8)
|
||||
throw new CorruptedInputException();
|
||||
|
||||
int pb = props / (9 * 5);
|
||||
props -= pb * 9 * 5;
|
||||
int lp = props / 9;
|
||||
int lc = props - lp * 9;
|
||||
|
||||
if (lc + lp > 4)
|
||||
throw new CorruptedInputException();
|
||||
|
||||
lzma = new LZMADecoder(lz, rc, lc, lp, pb);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of uncompressed bytes that can be read
|
||||
* without blocking. The value is returned with an assumption
|
||||
* that the compressed input data will be valid. If the compressed
|
||||
* data is corrupt, <code>CorruptedInputException</code> may get
|
||||
* thrown before the number of bytes claimed to be available have
|
||||
* been read from this input stream.
|
||||
* <p>
|
||||
* In LZMAInputStream, the return value will be non-zero when the
|
||||
* decompressor is in the middle of an LZMA2 chunk. The return value
|
||||
* will then be the number of uncompressed bytes remaining from that
|
||||
* chunk.
|
||||
*
|
||||
* @return the number of uncompressed bytes that can be read
|
||||
* without blocking
|
||||
*/
|
||||
public int available() {
|
||||
return uncompressedSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls <code>in.close()</code>.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
in.close();
|
||||
}
|
||||
}
|
@ -1,139 +0,0 @@
|
||||
/*
|
||||
* LZMA2Options
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Options for LZMA2.
|
||||
* <p>
|
||||
* FIXME: This is unfinished and things might change.
|
||||
*/
|
||||
public class LZMA2Options extends FilterOptions {
|
||||
/**
|
||||
* Default compression preset.
|
||||
*/
|
||||
public static final int PRESET_DEFAULT = 6;
|
||||
|
||||
/**
|
||||
* Minimum dictionary size.
|
||||
*/
|
||||
public static final int DICT_SIZE_MIN = 4096;
|
||||
|
||||
/**
|
||||
* Maximum dictionary size for compression.
|
||||
* <p>
|
||||
* FIXME? Decompression dictionary size can be bigger.
|
||||
*/
|
||||
public static final int DICT_SIZE_MAX = 128 << 20;
|
||||
|
||||
/**
|
||||
* Maximum value for lc + lp.
|
||||
*/
|
||||
public static final int LC_LP_MAX = 4;
|
||||
|
||||
/**
|
||||
* Maximum value for pb.
|
||||
*/
|
||||
public static final int PB_MAX = 4;
|
||||
|
||||
/**
|
||||
* Compression mode: uncompressed.
|
||||
* The data is wrapped into a LZMA2 stream without compression.
|
||||
*/
|
||||
public static final int MODE_UNCOMPRESSED = 0;
|
||||
|
||||
/**
|
||||
* Compression mode: fast.
|
||||
* This is usually combined with a hash chain match finder.
|
||||
*/
|
||||
public static final int MODE_FAST = 1;
|
||||
|
||||
/**
|
||||
* Compression mode: normal.
|
||||
* This is usually combined with a binary tree match finder.
|
||||
*/
|
||||
public static final int MODE_NORMAL = 2;
|
||||
|
||||
/**
|
||||
* Minimum value for <code>niceLen</code>.
|
||||
*/
|
||||
public static final int NICE_LEN_MIN = 8;
|
||||
|
||||
/**
|
||||
* Maximum value for <code>niceLen</code>.
|
||||
*/
|
||||
public static final int NICE_LEN_MAX = 273;
|
||||
|
||||
/**
|
||||
* Match finder: Hash Chain 2-3-4
|
||||
*/
|
||||
public static final int MF_HC4 = 0x04;
|
||||
|
||||
/**
|
||||
* Match finder: Binary tree 2-3-4
|
||||
*/
|
||||
public static final int MF_BT4 = 0x14;
|
||||
|
||||
private int dictSize;
|
||||
|
||||
/*
|
||||
public int lc;
|
||||
public int lp;
|
||||
public int pb;
|
||||
public int mode;
|
||||
public int niceLen;
|
||||
public int mf;
|
||||
public int depth;
|
||||
*/
|
||||
|
||||
public LZMA2Options() {
|
||||
setPreset(PRESET_DEFAULT);
|
||||
}
|
||||
|
||||
public LZMA2Options(int preset) {
|
||||
setPreset(preset);
|
||||
}
|
||||
|
||||
public void setPreset(int preset) {
|
||||
// TODO
|
||||
dictSize = 8 << 20;
|
||||
}
|
||||
|
||||
public int getEncoderMemoryUsage() {
|
||||
return LZMA2OutputStream.getMemoryUsage(this);
|
||||
}
|
||||
|
||||
public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
|
||||
return new LZMA2OutputStream(out, this);
|
||||
}
|
||||
|
||||
public int getDecoderMemoryUsage() {
|
||||
return LZMA2InputStream.getMemoryUsage(dictSize);
|
||||
}
|
||||
|
||||
public InputStream getInputStream(InputStream in) {
|
||||
return new LZMA2InputStream(in, dictSize);
|
||||
}
|
||||
|
||||
FilterEncoder getFilterEncoder() {
|
||||
return new LZMA2Encoder(this);
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
try {
|
||||
return super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
// Never reached
|
||||
throw new RuntimeException();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,77 +0,0 @@
|
||||
/*
|
||||
* LZMA2OutputStream
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
//
|
||||
// TODO: This creates a valid LZMA2 stream but it doesn't compress.
|
||||
// So this is useless except for testing the .xz container support.
|
||||
//
|
||||
|
||||
class LZMA2OutputStream extends FinishableOutputStream {
|
||||
private final FinishableOutputStream out;
|
||||
|
||||
static int getMemoryUsage(LZMA2Options options) {
|
||||
// TODO
|
||||
return 1;
|
||||
}
|
||||
|
||||
LZMA2OutputStream(FinishableOutputStream out, LZMA2Options options) {
|
||||
this.out = out;
|
||||
}
|
||||
|
||||
public void write(int b) throws IOException {
|
||||
byte[] buf = new byte[1];
|
||||
buf[0] = (byte)b;
|
||||
write(buf, 0, 1);
|
||||
}
|
||||
|
||||
public void write(byte[] buf, int off, int len) throws IOException {
|
||||
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
|
||||
throw new IllegalArgumentException();
|
||||
|
||||
while (off > 0x10000) {
|
||||
writeChunk(buf, off, 0x10000);
|
||||
off += 0x10000;
|
||||
len -= 0x10000;
|
||||
}
|
||||
|
||||
writeChunk(buf, off, len);
|
||||
}
|
||||
|
||||
private void writeChunk(byte[] buf, int off, int len) throws IOException {
|
||||
out.write(0x01);
|
||||
out.write((len - 1) >>> 8);
|
||||
out.write(len - 1);
|
||||
out.write(buf, off, len);
|
||||
}
|
||||
|
||||
private void writeEndMarker() throws IOException {
|
||||
// TODO: Flush incomplete chunk.
|
||||
out.write(0x00);
|
||||
}
|
||||
|
||||
public void flush() throws IOException {
|
||||
throw new UnsupportedOptionsException(
|
||||
"Flushing LZMA2OutputStream not implemented yet");
|
||||
}
|
||||
|
||||
public void finish() throws IOException {
|
||||
writeEndMarker();
|
||||
out.finish();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
writeEndMarker();
|
||||
out.close();
|
||||
}
|
||||
}
|
@ -1,60 +0,0 @@
|
||||
/*
|
||||
* MemoryLimitException
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
/**
|
||||
* Thrown when the memory usage limit given to the XZ decompressor
|
||||
* would be exceeded.
|
||||
* <p>
|
||||
* The amount of memory required and the memory usage limit are
|
||||
* included in the error detail message in human readable format.
|
||||
*/
|
||||
public class MemoryLimitException extends XZIOException {
|
||||
private static final long serialVersionUID = 3L;
|
||||
|
||||
private final int memoryNeeded;
|
||||
private final int memoryLimit;
|
||||
|
||||
/**
|
||||
* Creates a new MemoryLimitException.
|
||||
* <p>
|
||||
* The amount of memory needed and the memory usage limit are
|
||||
* included in the error detail message.
|
||||
*
|
||||
* @param memoryNeeded amount of memory needed as kibibytes (KiB)
|
||||
* @param memoryLimit specified memory usage limit as kibibytes (KiB)
|
||||
*/
|
||||
public MemoryLimitException(int memoryNeeded, int memoryLimit) {
|
||||
super("" + memoryNeeded + " KiB of memory would be needed; limit was "
|
||||
+ memoryLimit + " KiB");
|
||||
|
||||
this.memoryNeeded = memoryNeeded;
|
||||
this.memoryLimit = memoryLimit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets how much memory is required to decompress the data.
|
||||
*
|
||||
* @return amount of memory needed as kibibytes (KiB)
|
||||
*/
|
||||
public int getMemoryNeeded() {
|
||||
return memoryNeeded;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets what the memory usage limit was at the time the exception
|
||||
* was created.
|
||||
*
|
||||
* @return memory usage limit as kibibytes (KiB)
|
||||
*/
|
||||
public int getMemoryLimit() {
|
||||
return memoryLimit;
|
||||
}
|
||||
}
|
@ -1,33 +0,0 @@
|
||||
/*
|
||||
* RawCoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
class RawCoder {
|
||||
static void validate(FilterCoder[] filters)
|
||||
throws UnsupportedOptionsException {
|
||||
for (int i = 0; i < filters.length - 1; ++i)
|
||||
if (!filters[i].nonLastOK())
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported XZ filter chain");
|
||||
|
||||
if (!filters[filters.length - 1].lastOK())
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported XZ filter chain");
|
||||
|
||||
int changesSizeCount = 0;
|
||||
for (int i = 0; i < filters.length; ++i)
|
||||
if (filters[i].changesSize())
|
||||
++changesSizeCount;
|
||||
|
||||
if (changesSizeCount > 3)
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported XZ filter chain");
|
||||
}
|
||||
}
|
@ -1,285 +0,0 @@
|
||||
/*
|
||||
* SingleXZInputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.EOFException;
|
||||
import org.tukaani.xz.common.DecoderUtil;
|
||||
import org.tukaani.xz.common.StreamFlags;
|
||||
import org.tukaani.xz.index.IndexHash;
|
||||
import org.tukaani.xz.check.Check;
|
||||
|
||||
/**
|
||||
* Decompresses exactly one XZ Stream in streamed mode (no seeking).
|
||||
* The decompression stops after the first XZ Stream has been decompressed,
|
||||
* and the read position in the input stream is left at the first byte
|
||||
* after the end of the XZ Stream. This can be useful when XZ data has
|
||||
* been stored inside some other file format or protocol.
|
||||
* <p>
|
||||
* Unless you know what you are doing, don't use this class to decompress
|
||||
* standalone .xz files. For that purpose, use <code>XZInputStream</code>.
|
||||
*
|
||||
* @see XZInputStream
|
||||
*/
|
||||
public class SingleXZInputStream extends InputStream {
|
||||
private InputStream in;
|
||||
private int memoryLimit;
|
||||
private StreamFlags streamHeaderFlags;
|
||||
private Check check;
|
||||
private BlockInputStream blockDecoder = null;
|
||||
private final IndexHash indexHash = new IndexHash();
|
||||
private boolean endReached = false;
|
||||
private IOException exception = null;
|
||||
|
||||
/**
|
||||
* Creates a new input stream that decompresses exactly one XZ Stream
|
||||
* from <code>in</code>.
|
||||
* <p>
|
||||
* This constructor reads and parses the XZ Stream Header (12 bytes)
|
||||
* from <code>in</code>. The header of the first Block is not read
|
||||
* until <code>read</code> is called.
|
||||
*
|
||||
* @param in input stream from which XZ-compressed
|
||||
* data is read
|
||||
*
|
||||
* @throws XZFormatException
|
||||
* input is not in the XZ format
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* XZ header CRC32 doesn't match
|
||||
*
|
||||
* @throws UnsupportedOptionsException
|
||||
* XZ header is valid but specifies options
|
||||
* not supported by this implementation
|
||||
*
|
||||
* @throws EOFException
|
||||
* less than 12 bytes of input was available
|
||||
* from <code>in</code>
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public SingleXZInputStream(InputStream in) throws IOException {
|
||||
initialize(in, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new single-stream XZ decompressor with optional
|
||||
* memory usage limit.
|
||||
* <p>
|
||||
* This is identical to <code>SingleXZInputStream(InputStream)</code>
|
||||
* except that this takes also the <code>memoryLimit</code> argument.
|
||||
*
|
||||
* @param in input stream from which XZ-compressed
|
||||
* data is read
|
||||
*
|
||||
* @param memoryLimit memory usage limit as kibibytes (KiB)
|
||||
* or -1 to impose no memory usage limit
|
||||
*
|
||||
* @throws XZFormatException
|
||||
* input is not in the XZ format
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* XZ header CRC32 doesn't match
|
||||
*
|
||||
* @throws UnsupportedOptionsException
|
||||
* XZ header is valid but specifies options
|
||||
* not supported by this implementation
|
||||
*
|
||||
* @throws EOFException
|
||||
* less than 12 bytes of input was available
|
||||
* from <code>in</code>
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public SingleXZInputStream(InputStream in, int memoryLimit)
|
||||
throws IOException {
|
||||
initialize(in, memoryLimit);
|
||||
}
|
||||
|
||||
SingleXZInputStream(InputStream in, int memoryLimit,
|
||||
byte[] streamHeader) throws IOException {
|
||||
initialize(in, memoryLimit, streamHeader);
|
||||
}
|
||||
|
||||
private void initialize(InputStream in, int memoryLimit)
|
||||
throws IOException {
|
||||
byte[] streamHeader = new byte[DecoderUtil.STREAM_HEADER_SIZE];
|
||||
new DataInputStream(in).readFully(streamHeader);
|
||||
initialize(in, memoryLimit, streamHeader);
|
||||
}
|
||||
|
||||
private void initialize(InputStream in, int memoryLimit,
|
||||
byte[] streamHeader) throws IOException {
|
||||
this.in = in;
|
||||
this.memoryLimit = memoryLimit;
|
||||
streamHeaderFlags = DecoderUtil.decodeStreamHeader(streamHeader);
|
||||
check = Check.getInstance(streamHeaderFlags.checkType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the ID of the integrity check used in this XZ Stream.
|
||||
*
|
||||
* @return the Check ID specified in the XZ Stream Header
|
||||
*/
|
||||
public int getCheckType() {
|
||||
return streamHeaderFlags.checkType;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the name of the integrity check used in this XZ Stream.
|
||||
*
|
||||
* @return the name of the check specified in the XZ Stream Header
|
||||
*/
|
||||
public String getCheckName() {
|
||||
return check.getName();
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompresses the next byte from this input stream.
|
||||
* <p>
|
||||
* Reading lots of data with <code>read()</code> from this input stream
|
||||
* may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
|
||||
* if you need to read lots of data one byte at a time.
|
||||
*
|
||||
* @return the next decompressed byte, or <code>-1</code>
|
||||
* to indicate the end of the compressed stream
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* @throws UnsupportedOptionsException
|
||||
* @throws MemoryLimitException
|
||||
*
|
||||
* @throws EOFException
|
||||
* compressed input is truncated or corrupt
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public int read() throws IOException {
|
||||
byte[] buf = new byte[1];
|
||||
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompresses into an array of bytes.
|
||||
* <p>
|
||||
* If <code>len</code> is zero, no bytes are read and <code>0</code>
|
||||
* is returned. Otherwise this will try to decompress <code>len</code>
|
||||
* bytes of uncompressed data. Less than <code>len</code> bytes may
|
||||
* be read only in the following situations:
|
||||
* <ul>
|
||||
* <li>The end of the compressed data was reached successfully.</li>
|
||||
* <li>An error is detected after at least one but less <code>len</code>
|
||||
* bytes have already been successfully decompressed.
|
||||
* The next call with non-zero <code>len</code> will immediately
|
||||
* throw the pending exception.</li>
|
||||
* <li>An exception is thrown.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param buf target buffer for uncompressed data
|
||||
* @param off start offset in <code>buf</code>
|
||||
* @param len maximum number of uncompressed bytes to read
|
||||
*
|
||||
* @return number of bytes read, or <code>-1</code> to indicate
|
||||
* the end of the compressed stream
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* @throws UnsupportedOptionsException
|
||||
* @throws MemoryLimitException
|
||||
*
|
||||
* @throws EOFException
|
||||
* compressed input is truncated or corrupt
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public int read(byte[] buf, int off, int len) throws IOException {
|
||||
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
|
||||
throw new IllegalArgumentException();
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
if (exception != null)
|
||||
throw exception;
|
||||
|
||||
if (endReached)
|
||||
return -1;
|
||||
|
||||
int size = 0;
|
||||
|
||||
try {
|
||||
while (len > 0) {
|
||||
if (blockDecoder == null) {
|
||||
try {
|
||||
blockDecoder = new BlockInputStream(in, check,
|
||||
memoryLimit);
|
||||
} catch (IndexIndicatorException e) {
|
||||
indexHash.validate(in);
|
||||
validateStreamFooter();
|
||||
endReached = true;
|
||||
return size > 0 ? size : -1;
|
||||
}
|
||||
}
|
||||
|
||||
int ret = blockDecoder.read(buf, off, len);
|
||||
|
||||
if (ret > 0) {
|
||||
size += ret;
|
||||
off += ret;
|
||||
len -= ret;
|
||||
} else if (ret == -1) {
|
||||
indexHash.add(blockDecoder.getUnpaddedSize(),
|
||||
blockDecoder.getUncompressedSize());
|
||||
blockDecoder = null;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
exception = e;
|
||||
if (size == 0)
|
||||
throw e;
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
private void validateStreamFooter() throws IOException {
|
||||
byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
|
||||
new DataInputStream(in).readFully(buf);
|
||||
StreamFlags streamFooterFlags = DecoderUtil.decodeStreamFooter(buf);
|
||||
|
||||
if (!DecoderUtil.areStreamFlagsEqual(streamHeaderFlags,
|
||||
streamFooterFlags)
|
||||
|| indexHash.getIndexSize() != streamFooterFlags.backwardSize)
|
||||
throw new CorruptedInputException(
|
||||
"XZ Stream Footer does not match Stream Header");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of uncompressed bytes that can be read
|
||||
* without blocking. The value is returned with an assumption
|
||||
* that the compressed input data will be valid. If the compressed
|
||||
* data is corrupt, <code>CorruptedInputException</code> may get
|
||||
* thrown before the number of bytes claimed to be available have
|
||||
* been read from this input stream.
|
||||
*
|
||||
* @return the number of uncompressed bytes that can be read
|
||||
* without blocking
|
||||
*/
|
||||
public int available() throws IOException {
|
||||
return blockDecoder == null ? 0 : blockDecoder.available();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls <code>in.close()</code>.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
in.close();
|
||||
}
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
/*
|
||||
* UnsupportedOptionsException
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
/**
|
||||
* Thrown when compression options not supported by this implementation
|
||||
* are detected. Some other implementation might support those options.
|
||||
*/
|
||||
public class UnsupportedOptionsException extends XZIOException {
|
||||
private static final long serialVersionUID = 3L;
|
||||
|
||||
/**
|
||||
* Creates a new UnsupportedOptionsException with null
|
||||
* as its error detail message.
|
||||
*/
|
||||
public UnsupportedOptionsException() {}
|
||||
|
||||
/**
|
||||
* Creates a new UnsupportedOptionsException with the given
|
||||
* error detail message.
|
||||
*
|
||||
* @param s error detail message
|
||||
*/
|
||||
public UnsupportedOptionsException(String s) {
|
||||
super(s);
|
||||
}
|
||||
}
|
@ -1,53 +0,0 @@
|
||||
/*
|
||||
* XZ
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
/**
|
||||
* XZ constants.
|
||||
*/
|
||||
public class XZ {
|
||||
/**
|
||||
* XZ Header Magic Bytes begin a XZ file.
|
||||
* This can be useful to detect XZ compressed data.
|
||||
*/
|
||||
public static final byte[] HEADER_MAGIC = {
|
||||
(byte)0xFD, '7', 'z', 'X', 'Z', '\0' };
|
||||
|
||||
/**
|
||||
* XZ Footer Magic Bytes are the last bytes of a XZ Stream.
|
||||
*/
|
||||
public static final byte[] FOOTER_MAGIC = { 'Y', 'Z' };
|
||||
|
||||
/**
|
||||
* Integrity check ID indicating that no integrity check is calculated.
|
||||
* <p>
|
||||
* Omitting the integrity check is strongly discouraged except when
|
||||
* the integrity of the data will be verified by other means anyway,
|
||||
* and calculating the check twice would be useless.
|
||||
*/
|
||||
public static final int CHECK_NONE = 0;
|
||||
|
||||
/**
|
||||
* Integrity check ID for CRC32.
|
||||
*/
|
||||
public static final int CHECK_CRC32 = 1;
|
||||
|
||||
/**
|
||||
* Integrity check ID for CRC64.
|
||||
*/
|
||||
public static final int CHECK_CRC64 = 4;
|
||||
|
||||
/**
|
||||
* Integrity check ID for SHA-256.
|
||||
*/
|
||||
public static final int CHECK_SHA256 = 10;
|
||||
|
||||
private XZ() {}
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
/*
|
||||
* XZFormatException
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
/**
|
||||
* Thrown when the input data is not in the XZ format.
|
||||
*/
|
||||
public class XZFormatException extends XZIOException {
|
||||
private static final long serialVersionUID = 3L;
|
||||
|
||||
/**
|
||||
* Creates a new exception with the default error detail message.
|
||||
*/
|
||||
public XZFormatException() {
|
||||
super("Input is not in the XZ format");
|
||||
}
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
/*
|
||||
* XZIOException
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
/**
|
||||
* Generic IOException specific to this package.
|
||||
* All IOExceptions thrown by this package are extended from XZIOException.
|
||||
* This way it is easier to distinguish exceptions thrown by the XZ code
|
||||
* from other IOExceptions.
|
||||
*/
|
||||
public class XZIOException extends java.io.IOException {
|
||||
private static final long serialVersionUID = 3L;
|
||||
|
||||
public XZIOException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public XZIOException(String s) {
|
||||
super(s);
|
||||
}
|
||||
}
|
@ -1,257 +0,0 @@
|
||||
/*
|
||||
* XZInputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.EOFException;
|
||||
import org.tukaani.xz.common.DecoderUtil;
|
||||
|
||||
/**
|
||||
* Decompresses a .xz file in streamed mode (no seeking).
|
||||
* <p>
|
||||
* Use this to decompress regular standalone .xz files. This reads from
|
||||
* its input stream until the end of the input or until an error occurs.
|
||||
* This supports decompressing concatenated .xz files.
|
||||
*
|
||||
* @see SingleXZInputStream
|
||||
*/
|
||||
public class XZInputStream extends InputStream {
|
||||
private final int memoryLimit;
|
||||
private final InputStream in;
|
||||
private SingleXZInputStream xzIn;
|
||||
private boolean endReached = false;
|
||||
private IOException exception = null;
|
||||
|
||||
/**
|
||||
* Creates a new input stream that decompresses XZ-compressed data
|
||||
* from <code>in</code>.
|
||||
* <p>
|
||||
* This constructor reads and parses the XZ Stream Header (12 bytes)
|
||||
* from <code>in</code>. The header of the first Block is not read
|
||||
* until <code>read</code> is called.
|
||||
*
|
||||
* @param in input stream from which XZ-compressed
|
||||
* data is read
|
||||
*
|
||||
* @throws XZFormatException
|
||||
* input is not in the XZ format
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* XZ header CRC32 doesn't match
|
||||
*
|
||||
* @throws UnsupportedOptionsException
|
||||
* XZ header is valid but specifies options
|
||||
* not supported by this implementation
|
||||
*
|
||||
* @throws EOFException
|
||||
* less than 12 bytes of input was available
|
||||
* from <code>in</code>
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public XZInputStream(InputStream in) throws IOException {
|
||||
this.in = in;
|
||||
this.memoryLimit = -1;
|
||||
this.xzIn = new SingleXZInputStream(in, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new input stream that decompresses XZ-compressed data
|
||||
* from <code>in</code>.
|
||||
* <p>
|
||||
* This is identical to <code>XZInputStream(InputStream)</code> except
|
||||
* that this takes also the <code>memoryLimit</code> argument.
|
||||
*
|
||||
* @param in input stream from which XZ-compressed
|
||||
* data is read
|
||||
*
|
||||
* @param memoryLimit memory usage limit as kibibytes (KiB)
|
||||
* or -1 to impose no memory usage limit
|
||||
*
|
||||
* @throws XZFormatException
|
||||
* input is not in the XZ format
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* XZ header CRC32 doesn't match
|
||||
*
|
||||
* @throws UnsupportedOptionsException
|
||||
* XZ header is valid but specifies options
|
||||
* not supported by this implementation
|
||||
*
|
||||
* @throws EOFException
|
||||
* less than 12 bytes of input was available
|
||||
* from <code>in</code>
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public XZInputStream(InputStream in, int memoryLimit) throws IOException {
|
||||
this.in = in;
|
||||
this.memoryLimit = memoryLimit;
|
||||
this.xzIn = new SingleXZInputStream(in, memoryLimit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompresses the next byte from this input stream.
|
||||
* <p>
|
||||
* Reading lots of data with <code>read()</code> from this input stream
|
||||
* may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
|
||||
* if you need to read lots of data one byte at a time.
|
||||
*
|
||||
* @return the next decompressed byte, or <code>-1</code>
|
||||
* to indicate the end of the compressed stream
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* @throws UnsupportedOptionsException
|
||||
* @throws MemoryLimitException
|
||||
*
|
||||
* @throws EOFException
|
||||
* compressed input is truncated or corrupt
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public int read() throws IOException {
|
||||
byte[] buf = new byte[1];
|
||||
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompresses into an array of bytes.
|
||||
* <p>
|
||||
* If <code>len</code> is zero, no bytes are read and <code>0</code>
|
||||
* is returned. Otherwise this will try to decompress <code>len</code>
|
||||
* bytes of uncompressed data. Less than <code>len</code> bytes may
|
||||
* be read only in the following situations:
|
||||
* <ul>
|
||||
* <li>The end of the compressed data was reached successfully.</li>
|
||||
* <li>An error is detected after at least one but less <code>len</code>
|
||||
* bytes have already been successfully decompressed.
|
||||
* The next call with non-zero <code>len</code> will immediately
|
||||
* throw the pending exception.</li>
|
||||
* <li>An exception is thrown.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param buf target buffer for uncompressed data
|
||||
* @param off start offset in <code>buf</code>
|
||||
* @param len maximum number of uncompressed bytes to read
|
||||
*
|
||||
* @return number of bytes read, or <code>-1</code> to indicate
|
||||
* the end of the compressed stream
|
||||
*
|
||||
* @throws CorruptedInputException
|
||||
* @throws UnsupportedOptionsException
|
||||
* @throws MemoryLimitException
|
||||
*
|
||||
* @throws EOFException
|
||||
* compressed input is truncated or corrupt
|
||||
*
|
||||
* @throws IOException may be thrown by <code>in</code>
|
||||
*/
|
||||
public int read(byte[] buf, int off, int len) throws IOException {
|
||||
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
|
||||
throw new IllegalArgumentException();
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
if (exception != null)
|
||||
throw exception;
|
||||
|
||||
if (endReached)
|
||||
return -1;
|
||||
|
||||
int size = 0;
|
||||
|
||||
try {
|
||||
while (len > 0) {
|
||||
if (xzIn == null) {
|
||||
prepareNextStream();
|
||||
if (endReached)
|
||||
return size == 0 ? -1 : size;
|
||||
}
|
||||
|
||||
int ret = xzIn.read(buf, off, len);
|
||||
|
||||
if (ret > 0) {
|
||||
size += ret;
|
||||
off += ret;
|
||||
len -= ret;
|
||||
} else if (ret == -1) {
|
||||
xzIn = null;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
exception = e;
|
||||
if (size == 0)
|
||||
throw e;
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
private void prepareNextStream() throws IOException {
|
||||
DataInputStream inData = new DataInputStream(in);
|
||||
byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
|
||||
|
||||
// The size of Stream Padding must be a multiple of four bytes,
|
||||
// all bytes zero.
|
||||
do {
|
||||
// First try to read one byte to see if we have reached the end
|
||||
// of the file.
|
||||
int ret = inData.read(buf, 0, 1);
|
||||
if (ret == -1) {
|
||||
endReached = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Since we got one byte of input, there must be at least
|
||||
// three more available in a valid file.
|
||||
inData.readFully(buf, 1, 3);
|
||||
|
||||
} while (buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0);
|
||||
|
||||
// Not all bytes are zero. In a valid Stream it indicates the
|
||||
// beginning of the next Stream. Read the rest of the Stream Header
|
||||
// and initialize the XZ decoder.
|
||||
inData.readFully(buf, 4, DecoderUtil.STREAM_HEADER_SIZE - 4);
|
||||
|
||||
try {
|
||||
xzIn = new SingleXZInputStream(in, memoryLimit, buf);
|
||||
} catch (XZFormatException e) {
|
||||
// Since this isn't the first .xz Stream, it is more
|
||||
// logical to tell that the data is corrupt.
|
||||
throw new CorruptedInputException(
|
||||
"Garbage after a valid XZ Stream");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of uncompressed bytes that can be read
|
||||
* without blocking. The value is returned with an assumption
|
||||
* that the compressed input data will be valid. If the compressed
|
||||
* data is corrupt, <code>CorruptedInputException</code> may get
|
||||
* thrown before the number of bytes claimed to be available have
|
||||
* been read from this input stream.
|
||||
*
|
||||
* @return the number of uncompressed bytes that can be read
|
||||
* without blocking
|
||||
*/
|
||||
public int available() throws IOException {
|
||||
return xzIn == null ? 0 : xzIn.available();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls <code>in.close()</code>.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
in.close();
|
||||
}
|
||||
}
|
@ -1,290 +0,0 @@
|
||||
/*
|
||||
* XZOutputStream
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.tukaani.xz.common.EncoderUtil;
|
||||
import org.tukaani.xz.common.StreamFlags;
|
||||
import org.tukaani.xz.check.Check;
|
||||
import org.tukaani.xz.index.IndexEncoder;
|
||||
|
||||
/**
|
||||
* Compresses into the .xz file format.
|
||||
*/
|
||||
public class XZOutputStream extends FinishableOutputStream {
|
||||
private OutputStream out;
|
||||
private final StreamFlags streamFlags = new StreamFlags();
|
||||
private Check check;
|
||||
private final IndexEncoder index = new IndexEncoder();
|
||||
private FilterEncoder[] filters;
|
||||
private BlockOutputStream blockEncoder = null;
|
||||
private IOException exception = null;
|
||||
private boolean finished = false;
|
||||
|
||||
/**
|
||||
* Creates a new output stream that compressed data into the .xz format.
|
||||
* This is takes options for one filter as an argument. This constructor
|
||||
* is equivalent to passing a single-member filterOptions array to the
|
||||
* other constructor.
|
||||
*
|
||||
* @param out output stream to which the compressed data
|
||||
* will be written
|
||||
*
|
||||
* @param filterOptions
|
||||
* filter options to use
|
||||
*
|
||||
* @param checkType type of the integrity check,
|
||||
* for example XZ.CHECK_CRC64
|
||||
*
|
||||
* @throws UnsupportedOptionsException
|
||||
* invalid filter chain
|
||||
*
|
||||
* @throws IOException may be thrown from <code>out</code>
|
||||
*/
|
||||
public XZOutputStream(OutputStream out, FilterOptions filterOptions,
|
||||
int checkType) throws IOException {
|
||||
FilterOptions[] ops = new FilterOptions[1];
|
||||
ops[0] = filterOptions;
|
||||
initialize(out, ops, checkType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new output stream that compressed data into the .xz format.
|
||||
* This takes an array of filter options, allowing the caller to specify
|
||||
* a filter chain with 1-4 filters.
|
||||
*
|
||||
* @param out output stream to which the compressed data
|
||||
* will be written
|
||||
*
|
||||
* @param filterOptions
|
||||
* array of filter options to use
|
||||
*
|
||||
* @param checkType type of the integrity check,
|
||||
* for example XZ.CHECK_CRC64
|
||||
*
|
||||
* @throws UnsupportedOptionsException
|
||||
* invalid filter chain
|
||||
*
|
||||
* @throws IOException may be thrown from <code>out</code>
|
||||
*/
|
||||
public XZOutputStream(OutputStream out, FilterOptions[] filterOptions,
|
||||
int checkType) throws IOException {
|
||||
initialize(out, filterOptions, checkType);
|
||||
}
|
||||
|
||||
private void initialize(OutputStream out, FilterOptions[] filterOptions,
|
||||
int checkType) throws IOException {
|
||||
this.out = out;
|
||||
updateFilters(filterOptions);
|
||||
|
||||
streamFlags.checkType = checkType;
|
||||
check = Check.getInstance(checkType);
|
||||
|
||||
encodeStreamHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the filter chain.
|
||||
* <p>
|
||||
* Currently this cannot be used to update e.g. LZMA2 options in the
|
||||
* middle of a XZ Block. Use <code>flush()</code> to finish the current
|
||||
* XZ Block before calling this function. The new filter chain will then
|
||||
* be used for the next XZ Block.
|
||||
*/
|
||||
public void updateFilters(FilterOptions[] filterOptions)
|
||||
throws XZIOException {
|
||||
if (blockEncoder != null)
|
||||
throw new UnsupportedOptionsException("Changing filter options "
|
||||
+ "in the middle of a XZ Block not implemented");
|
||||
|
||||
if (filterOptions.length < 1 || filterOptions.length > 4)
|
||||
throw new UnsupportedOptionsException(
|
||||
"XZ filter chain must be 1-4 filters");
|
||||
|
||||
FilterEncoder[] newFilters = new FilterEncoder[filterOptions.length];
|
||||
for (int i = 0; i < filterOptions.length; ++i)
|
||||
newFilters[i] = filterOptions[i].getFilterEncoder();
|
||||
|
||||
RawCoder.validate(newFilters);
|
||||
filters = newFilters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes one byte to be compressed.
|
||||
*
|
||||
* @throws XZIOException
|
||||
* XZ stream has grown too big
|
||||
* @throws IOException may be thrown by the underlying output stream
|
||||
*/
|
||||
public void write(int b) throws IOException {
|
||||
byte[] buf = new byte[] { (byte)b };
|
||||
write(buf, 0, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an array of bytes to be compressed.
|
||||
* The compressors tend to do internal buffering and thus the written
|
||||
* data won't be readable from the compressed output immediately.
|
||||
* Use <code>flush()</code> to force everything written so far to
|
||||
* be written to the underlaying output stream, but be aware that
|
||||
* flushing reduces compression ratio.
|
||||
*
|
||||
* @param buf buffer of bytes to be written
|
||||
* @param off start offset in <code>buf</code>
|
||||
* @param len number of bytes to write
|
||||
*
|
||||
* @throws XZIOException
|
||||
* XZ stream has grown too big
|
||||
* @throws XZIOException
|
||||
* <code>finish()</code> or <code>close()</code>
|
||||
* was already called
|
||||
* @throws IOException may be thrown by the underlying output stream
|
||||
*/
|
||||
public void write(byte[] buf, int off, int len) throws IOException {
|
||||
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
|
||||
throw new IllegalArgumentException();
|
||||
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
if (finished)
|
||||
exception = new XZIOException(
|
||||
"XZOutputStream.write was called on a finished stream");
|
||||
|
||||
if (exception != null)
|
||||
throw exception;
|
||||
|
||||
if (blockEncoder == null)
|
||||
blockEncoder = new BlockOutputStream(out, filters, check);
|
||||
|
||||
try {
|
||||
blockEncoder.write(buf, off, len);
|
||||
} catch (IOException e) {
|
||||
exception = e;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes the encoder and calls <code>out.flush()</code>.
|
||||
* <p>
|
||||
* FIXME: I haven't decided yet how this will work in the final version.
|
||||
* In the current implementation, flushing finishes the current .xz Block.
|
||||
* This is equivalent to LZMA_FULL_FLUSH in liblzma (XZ Utils).
|
||||
* Equivalent of liblzma's LZMA_SYNC_FLUSH might be implemented in
|
||||
* the future, and perhaps should be what <code>flush()</code> should do.
|
||||
*/
|
||||
public void flush() throws IOException {
|
||||
if (exception != null)
|
||||
throw exception;
|
||||
|
||||
if (blockEncoder != null) {
|
||||
try {
|
||||
blockEncoder.finish();
|
||||
index.add(blockEncoder.getUnpaddedSize(),
|
||||
blockEncoder.getUncompressedSize());
|
||||
blockEncoder = null;
|
||||
} catch (IOException e) {
|
||||
exception = e;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
out.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Finishes compression without closing the underlying stream.
|
||||
* No more data can be written to this stream after finishing
|
||||
* (calling <code>write</code> with an empty buffer is OK).
|
||||
* <p>
|
||||
* Repeated calls to <code>finish()</code> do nothing unless
|
||||
* an exception was thrown by this stream earlier. In that case
|
||||
* the same exception is thrown again.
|
||||
* <p>
|
||||
* After finishing, the stream may be closed normally with
|
||||
* <code>close()</code>. If the stream will be closed anyway, there
|
||||
* usually is no need to call <code>finish()</code> separately.
|
||||
*/
|
||||
public void finish() throws IOException {
|
||||
if (!finished) {
|
||||
// flush() checks for pending exceptions so we don't need to
|
||||
// worry about it here.
|
||||
flush();
|
||||
|
||||
try {
|
||||
index.encode(out);
|
||||
encodeStreamFooter();
|
||||
finished = true;
|
||||
} catch (IOException e) {
|
||||
exception = e;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finishes compression and closes the underlying stream.
|
||||
* The underlying stream <code>out</code> is closed even if finishing
|
||||
* fails. If both finishing and closing fail, the exception thrown
|
||||
* by <code>finish()</code> is thrown and the exception from the failed
|
||||
* <code>out.close()</code> is lost.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
// If finish() throws an exception, it stores the exception to
|
||||
// the variable "exception". So we can ignore the possible
|
||||
// exception here.
|
||||
try {
|
||||
finish();
|
||||
} catch (IOException e) {}
|
||||
|
||||
try {
|
||||
out.close();
|
||||
} catch (IOException e) {
|
||||
// Remember the exception but only if there is no previous
|
||||
// pending exception.
|
||||
if (exception == null)
|
||||
exception = e;
|
||||
}
|
||||
|
||||
if (exception != null)
|
||||
throw exception;
|
||||
}
|
||||
|
||||
private void encodeStreamFlags(byte[] buf, int off) {
|
||||
buf[off] = 0x00;
|
||||
buf[off + 1] = (byte)streamFlags.checkType;
|
||||
}
|
||||
|
||||
private void encodeStreamHeader() throws IOException {
|
||||
out.write(XZ.HEADER_MAGIC);
|
||||
|
||||
byte[] buf = new byte[2];
|
||||
encodeStreamFlags(buf, 0);
|
||||
out.write(buf);
|
||||
|
||||
EncoderUtil.writeCRC32(out, buf);
|
||||
}
|
||||
|
||||
private void encodeStreamFooter() throws IOException {
|
||||
byte[] buf = new byte[6];
|
||||
long backwardSize = index.getIndexSize() / 4 - 1;
|
||||
for (int i = 0; i < 4; ++i)
|
||||
buf[i] = (byte)(backwardSize >>> (i * 8));
|
||||
|
||||
encodeStreamFlags(buf, 4);
|
||||
|
||||
EncoderUtil.writeCRC32(out, buf);
|
||||
out.write(buf);
|
||||
out.write(XZ.FOOTER_MAGIC);
|
||||
}
|
||||
}
|
@ -1,33 +0,0 @@
|
||||
/*
|
||||
* CRC32
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.check;
|
||||
|
||||
public class CRC32 extends Check {
|
||||
private final java.util.zip.CRC32 state = new java.util.zip.CRC32();
|
||||
|
||||
public CRC32() {
|
||||
size = 4;
|
||||
name = "CRC32";
|
||||
}
|
||||
|
||||
public void update(byte[] buf, int off, int len) {
|
||||
state.update(buf, off, len);
|
||||
}
|
||||
|
||||
public byte[] finish() {
|
||||
long value = state.getValue();
|
||||
byte[] buf = new byte[] { (byte)(value),
|
||||
(byte)(value >>> 8),
|
||||
(byte)(value >>> 16),
|
||||
(byte)(value >>> 24) };
|
||||
state.reset();
|
||||
return buf;
|
||||
}
|
||||
}
|
@ -1,54 +0,0 @@
|
||||
/*
|
||||
* CRC64
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.check;
|
||||
|
||||
public class CRC64 extends Check {
|
||||
private static final long poly = 0xC96C5795D7870F42L;
|
||||
private static final long[] crcTable = new long[256];
|
||||
|
||||
private long crc = -1;
|
||||
|
||||
static {
|
||||
for (int b = 0; b < crcTable.length; ++b) {
|
||||
long r = b;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if ((r & 1) == 1)
|
||||
r = (r >>> 1) ^ poly;
|
||||
else
|
||||
r >>>= 1;
|
||||
}
|
||||
|
||||
crcTable[b] = r;
|
||||
}
|
||||
}
|
||||
|
||||
public CRC64() {
|
||||
size = 8;
|
||||
name = "CRC64";
|
||||
}
|
||||
|
||||
public void update(byte[] buf, int off, int len) {
|
||||
int end = off + len;
|
||||
|
||||
while (off < end)
|
||||
crc = crcTable[(buf[off++] ^ (int)crc) & 0xFF] ^ (crc >>> 8);
|
||||
}
|
||||
|
||||
public byte[] finish() {
|
||||
long value = ~crc;
|
||||
crc = -1;
|
||||
|
||||
byte[] buf = new byte[8];
|
||||
for (int i = 0; i < buf.length; ++i)
|
||||
buf[i] = (byte)(value >> (i * 8));
|
||||
|
||||
return buf;
|
||||
}
|
||||
}
|
@ -1,57 +0,0 @@
|
||||
/*
|
||||
* Check
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.check;
|
||||
|
||||
import org.tukaani.xz.XZ;
|
||||
import org.tukaani.xz.UnsupportedOptionsException;
|
||||
|
||||
public abstract class Check {
|
||||
int size;
|
||||
String name;
|
||||
|
||||
public abstract void update(byte[] buf, int off, int len);
|
||||
public abstract byte[] finish();
|
||||
|
||||
public void update(byte[] buf) {
|
||||
update(buf, 0, buf.length);
|
||||
}
|
||||
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public static Check getInstance(int checkType)
|
||||
throws UnsupportedOptionsException {
|
||||
switch (checkType) {
|
||||
case XZ.CHECK_NONE:
|
||||
return new None();
|
||||
|
||||
case XZ.CHECK_CRC32:
|
||||
return new CRC32();
|
||||
|
||||
case XZ.CHECK_CRC64:
|
||||
return new CRC64();
|
||||
|
||||
case XZ.CHECK_SHA256:
|
||||
try {
|
||||
return new SHA256();
|
||||
} catch (java.security.NoSuchAlgorithmException e) {}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported Check ID " + checkType);
|
||||
}
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
/*
|
||||
* None
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.check;
|
||||
|
||||
public class None extends Check {
|
||||
public None() {
|
||||
size = 0;
|
||||
name = "None";
|
||||
}
|
||||
|
||||
public void update(byte[] buf, int off, int len) {}
|
||||
|
||||
public byte[] finish() {
|
||||
byte[] empty = new byte[0];
|
||||
return empty;
|
||||
}
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
/*
|
||||
* SHA256
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.check;
|
||||
|
||||
public class SHA256 extends Check {
|
||||
private final java.security.MessageDigest sha256;
|
||||
|
||||
public SHA256() throws java.security.NoSuchAlgorithmException {
|
||||
size = 32;
|
||||
name = "SHA-256";
|
||||
sha256 = java.security.MessageDigest.getInstance("SHA-256");
|
||||
}
|
||||
|
||||
public void update(byte[] buf, int off, int len) {
|
||||
sha256.update(buf, off, len);
|
||||
}
|
||||
|
||||
public byte[] finish() {
|
||||
byte[] buf = sha256.digest();
|
||||
sha256.reset();
|
||||
return buf;
|
||||
}
|
||||
}
|
@ -1,121 +0,0 @@
|
||||
/*
|
||||
* DecoderUtil
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.common;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.EOFException;
|
||||
import java.util.zip.CRC32;
|
||||
import org.tukaani.xz.XZ;
|
||||
import org.tukaani.xz.XZFormatException;
|
||||
import org.tukaani.xz.CorruptedInputException;
|
||||
import org.tukaani.xz.UnsupportedOptionsException;
|
||||
|
||||
public class DecoderUtil extends Util {
|
||||
public static boolean isCRC32Valid(byte[] buf, int off, int len,
|
||||
int ref_off) {
|
||||
CRC32 crc32 = new CRC32();
|
||||
crc32.update(buf, off, len);
|
||||
long value = crc32.getValue();
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
if ((byte)(value >>> (i * 8)) != buf[ref_off + i])
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public static StreamFlags decodeStreamHeader(byte[] buf)
|
||||
throws IOException {
|
||||
for (int i = 0; i < XZ.HEADER_MAGIC.length; ++i)
|
||||
if (buf[i] != XZ.HEADER_MAGIC[i])
|
||||
throw new XZFormatException();
|
||||
|
||||
if (!isCRC32Valid(buf, XZ.HEADER_MAGIC.length, 2,
|
||||
XZ.HEADER_MAGIC.length + 2))
|
||||
throw new CorruptedInputException("XZ Stream Header is corrupt");
|
||||
|
||||
try {
|
||||
return decodeStreamFlags(buf, XZ.HEADER_MAGIC.length);
|
||||
} catch (UnsupportedOptionsException e) {
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported options in XZ Stream Header");
|
||||
}
|
||||
}
|
||||
|
||||
public static StreamFlags decodeStreamFooter(byte[] buf)
|
||||
throws IOException {
|
||||
if (buf[10] != XZ.FOOTER_MAGIC[0] || buf[11] != XZ.FOOTER_MAGIC[1]) {
|
||||
// NOTE: The exception could be XZFormatException too.
|
||||
// It depends on the situation which one is better.
|
||||
throw new CorruptedInputException("XZ Stream Footer is corrupt");
|
||||
}
|
||||
|
||||
if (!isCRC32Valid(buf, 4, 6, 0))
|
||||
throw new CorruptedInputException("XZ Stream Footer is corrupt");
|
||||
|
||||
StreamFlags streamFlags;
|
||||
try {
|
||||
streamFlags = decodeStreamFlags(buf, 8);
|
||||
} catch (UnsupportedOptionsException e) {
|
||||
throw new UnsupportedOptionsException(
|
||||
"Unsupported options in XZ Stream Footer");
|
||||
}
|
||||
|
||||
streamFlags.backwardSize = 0;
|
||||
for (int i = 0; i < 4; ++i)
|
||||
streamFlags.backwardSize |= (buf[i + 4] & 0xFF) << (i * 8);
|
||||
|
||||
streamFlags.backwardSize = (streamFlags.backwardSize + 1) * 4;
|
||||
|
||||
return streamFlags;
|
||||
}
|
||||
|
||||
private static StreamFlags decodeStreamFlags(byte[] buf, int off)
|
||||
throws UnsupportedOptionsException {
|
||||
if (buf[off] != 0x00 || (buf[off + 1] & 0xFF) >= 0x10)
|
||||
throw new UnsupportedOptionsException();
|
||||
|
||||
StreamFlags streamFlags = new StreamFlags();
|
||||
streamFlags.checkType = buf[off + 1];
|
||||
|
||||
return streamFlags;
|
||||
}
|
||||
|
||||
public static boolean areStreamFlagsEqual(StreamFlags a, StreamFlags b) {
|
||||
// backwardSize is intentionally not compared.
|
||||
return a.checkType == b.checkType;
|
||||
}
|
||||
|
||||
public static long decodeVLI(InputStream in) throws IOException {
|
||||
int b = in.read();
|
||||
if (b == -1)
|
||||
throw new EOFException();
|
||||
|
||||
long num = b & 0x7F;
|
||||
int i = 0;
|
||||
|
||||
while ((b & 0x80) != 0x00) {
|
||||
if (++i >= VLI_SIZE_MAX)
|
||||
throw new CorruptedInputException();
|
||||
|
||||
b = in.read();
|
||||
if (b == -1)
|
||||
throw new EOFException();
|
||||
|
||||
if (b == 0x00)
|
||||
throw new CorruptedInputException();
|
||||
|
||||
num |= (long)(b & 0x7F) << (i * 7);
|
||||
}
|
||||
|
||||
return num;
|
||||
}
|
||||
}
|
@ -1,36 +0,0 @@
|
||||
/*
|
||||
* EncoderUtil
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.common;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.CRC32;
|
||||
|
||||
public class EncoderUtil extends Util {
|
||||
public static void writeCRC32(OutputStream out, byte[] buf)
|
||||
throws IOException {
|
||||
CRC32 crc32 = new CRC32();
|
||||
crc32.update(buf);
|
||||
long value = crc32.getValue();
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
out.write((byte)(value >>> (i * 8)));
|
||||
}
|
||||
|
||||
public static void encodeVLI(OutputStream out, long num)
|
||||
throws IOException {
|
||||
while (num >= 0x80) {
|
||||
out.write((byte)(num | 0x80));
|
||||
num >>>= 7;
|
||||
}
|
||||
|
||||
out.write((byte)num);
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
/*
|
||||
* StreamFlags
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.common;
|
||||
|
||||
public class StreamFlags {
|
||||
public int checkType = -1;
|
||||
public long backwardSize = -1;
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
/*
|
||||
* Util
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.common;
|
||||
|
||||
public class Util {
|
||||
public static final int STREAM_HEADER_SIZE = 12;
|
||||
public static final long BACKWARD_SIZE_MAX = 1L << 34;
|
||||
public static final int BLOCK_HEADER_SIZE_MAX = 1024;
|
||||
public static final long VLI_MAX = Long.MAX_VALUE;
|
||||
public static final int VLI_SIZE_MAX = 9;
|
||||
|
||||
public static int getVLISize(long num) {
|
||||
int size = 0;
|
||||
do {
|
||||
++size;
|
||||
num >>= 7;
|
||||
} while (num != 0);
|
||||
|
||||
return size;
|
||||
}
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
/*
|
||||
* DeltaCoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.delta;
|
||||
|
||||
abstract class DeltaCoder {
|
||||
static final int DISTANCE_MIN = 1;
|
||||
static final int DISTANCE_MAX = 256;
|
||||
static final int DISTANCE_MASK = DISTANCE_MAX - 1;
|
||||
|
||||
final int distance;
|
||||
final byte[] history = new byte[DISTANCE_MAX];
|
||||
int pos = 0;
|
||||
|
||||
public DeltaCoder(int distance) {
|
||||
if (distance < DISTANCE_MIN || distance > DISTANCE_MAX)
|
||||
throw new IllegalArgumentException();
|
||||
|
||||
this.distance = distance;
|
||||
}
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
/*
|
||||
* DeltaDecoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.delta;
|
||||
|
||||
public class DeltaDecoder extends DeltaCoder {
|
||||
public DeltaDecoder(int distance) {
|
||||
super(distance);
|
||||
}
|
||||
|
||||
public void decode(byte[] buf, int off, int len) {
|
||||
int end = off + len;
|
||||
for (int i = off; i < end; ++i) {
|
||||
buf[i] += history[(distance + pos) & DISTANCE_MASK];
|
||||
history[pos-- & DISTANCE_MASK] = buf[i];
|
||||
}
|
||||
}
|
||||
}
|
@ -1,56 +0,0 @@
|
||||
/*
|
||||
* IndexBase
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.index;
|
||||
|
||||
import org.tukaani.xz.common.Util;
|
||||
import org.tukaani.xz.XZIOException;
|
||||
|
||||
abstract class IndexBase {
|
||||
private final XZIOException invalidIndexException;
|
||||
long blocksSum = 0;
|
||||
long uncompressedSum = 0;
|
||||
long indexListSize = 0;
|
||||
long recordCount = 0;
|
||||
|
||||
IndexBase(XZIOException invalidIndexException) {
|
||||
this.invalidIndexException = invalidIndexException;
|
||||
}
|
||||
|
||||
private long getUnpaddedIndexSize() {
|
||||
// Index Indicator + Number of Records + List of Records + CRC32
|
||||
return 1 + Util.getVLISize(recordCount) + indexListSize + 4;
|
||||
}
|
||||
|
||||
public long getIndexSize() {
|
||||
return (getUnpaddedIndexSize() + 3) & ~3;
|
||||
}
|
||||
|
||||
long getStreamSize() {
|
||||
return Util.STREAM_HEADER_SIZE + blocksSum + getIndexSize()
|
||||
+ Util.STREAM_HEADER_SIZE;
|
||||
}
|
||||
|
||||
int getIndexPaddingSize() {
|
||||
return (int)((4 - getUnpaddedIndexSize()) & 3);
|
||||
}
|
||||
|
||||
void add(long unpaddedSize, long uncompressedSize) throws XZIOException {
|
||||
blocksSum += (unpaddedSize + 3) & ~3;
|
||||
uncompressedSum += uncompressedSize;
|
||||
indexListSize += Util.getVLISize(unpaddedSize)
|
||||
+ Util.getVLISize(uncompressedSize);
|
||||
++recordCount;
|
||||
|
||||
if (blocksSum < 0 || uncompressedSum < 0
|
||||
|| getIndexSize() > Util.BACKWARD_SIZE_MAX
|
||||
|| getStreamSize() < 0)
|
||||
throw invalidIndexException;
|
||||
}
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
/*
|
||||
* IndexEncoder
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.index;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.zip.CheckedOutputStream;
|
||||
import org.tukaani.xz.common.EncoderUtil;
|
||||
import org.tukaani.xz.XZIOException;
|
||||
|
||||
public class IndexEncoder extends IndexBase {
|
||||
private final ArrayList<IndexRecord> records = new ArrayList<>();
|
||||
|
||||
public IndexEncoder() {
|
||||
super(new XZIOException("XZ Stream or its Index has grown too big"));
|
||||
}
|
||||
|
||||
public void add(long unpaddedSize, long uncompressedSize)
|
||||
throws XZIOException {
|
||||
super.add(unpaddedSize, uncompressedSize);
|
||||
records.add(new IndexRecord(unpaddedSize, uncompressedSize));
|
||||
}
|
||||
|
||||
public void encode(OutputStream out) throws IOException {
|
||||
java.util.zip.CRC32 crc32 = new java.util.zip.CRC32();
|
||||
CheckedOutputStream outChecked = new CheckedOutputStream(out, crc32);
|
||||
|
||||
// Index Indicator
|
||||
outChecked.write(0x00);
|
||||
|
||||
// Number of Records
|
||||
EncoderUtil.encodeVLI(outChecked, recordCount);
|
||||
|
||||
// List of Records
|
||||
for (Iterator i = records.iterator(); i.hasNext(); ) {
|
||||
IndexRecord record = (IndexRecord)i.next();
|
||||
EncoderUtil.encodeVLI(outChecked, record.unpadded);
|
||||
EncoderUtil.encodeVLI(outChecked, record.uncompressed);
|
||||
}
|
||||
|
||||
// Index Padding
|
||||
for (int i = getIndexPaddingSize(); i > 0; --i)
|
||||
outChecked.write(0x00);
|
||||
|
||||
// CRC32
|
||||
long value = crc32.getValue();
|
||||
for (int i = 0; i < 4; ++i)
|
||||
out.write((byte)(value >>> (i * 8)));
|
||||
}
|
||||
}
|
@ -1,94 +0,0 @@
|
||||
/*
|
||||
* IndexHash
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.index;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Arrays;
|
||||
import java.util.zip.CheckedInputStream;
|
||||
import org.tukaani.xz.common.DecoderUtil;
|
||||
import org.tukaani.xz.XZIOException;
|
||||
import org.tukaani.xz.CorruptedInputException;
|
||||
|
||||
public class IndexHash extends IndexBase {
|
||||
private org.tukaani.xz.check.Check hash;
|
||||
|
||||
public IndexHash() {
|
||||
super(new CorruptedInputException());
|
||||
|
||||
try {
|
||||
hash = new org.tukaani.xz.check.SHA256();
|
||||
} catch (java.security.NoSuchAlgorithmException e) {
|
||||
hash = new org.tukaani.xz.check.CRC32();
|
||||
}
|
||||
}
|
||||
|
||||
public void add(long unpaddedSize, long uncompressedSize)
|
||||
throws XZIOException {
|
||||
super.add(unpaddedSize, uncompressedSize);
|
||||
|
||||
ByteBuffer buf = ByteBuffer.allocate(2 * 8);
|
||||
buf.putLong(unpaddedSize);
|
||||
buf.putLong(uncompressedSize);
|
||||
hash.update(buf.array());
|
||||
}
|
||||
|
||||
/**
 * Reads the Index field from {@code in} and validates it against the
 * values accumulated from the Blocks that were actually decoded, then
 * verifies the Index Padding and the stored CRC32.
 *
 * @param in stream positioned just after the Index Indicator byte
 * @throws CorruptedInputException if any Index field disagrees with the
 *         decoded Blocks, padding is nonzero, or the CRC32 mismatches
 * @throws IOException if reading from {@code in} fails
 */
public void validate(InputStream in) throws IOException {
    // Index Indicator (0x00) has already been read by BlockInputStream
    // so add 0x00 to the CRC32 here.
    java.util.zip.CRC32 crc32 = new java.util.zip.CRC32();
    crc32.update('\0');
    // Everything read below flows through the checksum as a side effect.
    CheckedInputStream inChecked = new CheckedInputStream(in, crc32);

    // Get and validate the Number of Records field.
    long storedRecordCount = DecoderUtil.decodeVLI(inChecked);
    if (storedRecordCount != recordCount)
        throw new CorruptedInputException("XZ Index is corrupt");

    // Decode and hash the Index field and compare it to
    // the hash value calculated from the decoded Blocks.
    IndexHash stored = new IndexHash();
    for (long i = 0; i < recordCount; ++i) {
        long unpaddedSize = DecoderUtil.decodeVLI(inChecked);
        long uncompressedSize = DecoderUtil.decodeVLI(inChecked);

        try {
            stored.add(unpaddedSize, uncompressedSize);
        } catch (XZIOException e) {
            throw new CorruptedInputException("XZ Index is corrupt");
        }

        // Fail early if the running sums already exceed the expected
        // totals; the exact-equality check after the loop covers the rest.
        if (stored.blocksSum > blocksSum
                || stored.uncompressedSum > uncompressedSum
                || stored.indexListSize > indexListSize)
            throw new CorruptedInputException("XZ Index is corrupt");
    }

    if (stored.blocksSum != blocksSum
            || stored.uncompressedSum != uncompressedSum
            || stored.indexListSize != indexListSize
            || !Arrays.equals(stored.hash.finish(), hash.finish()))
        throw new CorruptedInputException("XZ Index is corrupt");

    // Index Padding: every padding byte must be zero.
    DataInputStream inData = new DataInputStream(inChecked);
    for (int i = getIndexPaddingSize(); i > 0; --i)
        if (inData.readUnsignedByte() != 0x00)
            throw new CorruptedInputException("XZ Index is corrupt");

    // CRC32: compare byte by byte, least significant byte first,
    // against the checksum computed over everything read above.
    long value = crc32.getValue();
    for (int i = 0; i < 4; ++i)
        if (((value >>> (i * 8)) & 0xFF) != inData.readUnsignedByte())
            throw new CorruptedInputException("XZ Index is corrupt");
}
|
||||
}
|
@ -1,20 +0,0 @@
|
||||
/*
|
||||
* IndexRecord
|
||||
*
|
||||
* Author: Lasse Collin <lasse.collin@tukaani.org>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.index;
|
||||
|
||||
/**
 * One record of an XZ Index: the pair of sizes describing a single Block.
 */
public class IndexRecord {
    /** Unpadded Size of the Block. */
    public final long unpadded;

    /** Uncompressed Size of the Block. */
    public final long uncompressed;

    IndexRecord(long unpadded, long uncompressed) {
        this.uncompressed = uncompressed;
        this.unpadded = unpadded;
    }
}
|
@ -1,126 +0,0 @@
|
||||
/*
|
||||
* LZDecoder
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.lz;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import org.tukaani.xz.CorruptedInputException;
|
||||
|
||||
/**
 * Sliding-window (LZ77) dictionary used during decompression.
 * Decoded bytes are written into a circular buffer of {@code dictSize}
 * bytes and handed to the caller in chunks via {@link #flush}.
 */
public final class LZDecoder {
    private final byte[] buf;
    private int start = 0;       // first byte not yet returned by flush()
    private int pos = 0;         // next write position in buf
    private int full = 0;        // number of valid dictionary bytes so far
    private int limit = 0;       // write limit for the current decode call
    private int pendingLen = 0;  // remainder of a repeat() cut off by limit
    private int pendingDist = 0; // distance of that pending repeat

    /**
     * @param dictSize   dictionary (window) size in bytes
     * @param presetDict optional preset dictionary, may be null; only its
     *                   last {@code dictSize} bytes are kept
     */
    public LZDecoder(int dictSize, byte[] presetDict) {
        buf = new byte[dictSize];

        if (presetDict != null) {
            pos = Math.min(presetDict.length, dictSize);
            full = pos;
            start = pos;
            System.arraycopy(presetDict, presetDict.length - pos, buf, 0, pos);
        }
    }

    /** Resets to an empty dictionary, discarding any preset contents. */
    public void reset() {
        start = 0;
        pos = 0;
        full = 0;
        limit = 0;
        // NOTE(review): zeroing only the last byte presumably guards the
        // wraparound read path in getByte() — confirm against upstream.
        buf[buf.length - 1] = 0x00;
    }

    /** Caps how many bytes the next decode pass may produce. */
    public void setLimit(int outMax) {
        if (buf.length - pos <= outMax)
            limit = buf.length;
        else
            limit = pos + outMax;
    }

    /** True while more bytes may be written before hitting the limit. */
    public boolean hasSpace() {
        return pos < limit;
    }

    /** True if a repeat() was interrupted by the limit and must resume. */
    public boolean hasPending() {
        return pendingLen > 0;
    }

    /** Current write position inside the circular buffer. */
    public int getPos() {
        return pos;
    }

    /** Returns the byte {@code dist + 1} positions behind the write head. */
    public int getByte(int dist) {
        int offset = pos - dist - 1;
        if (dist >= pos)
            offset += buf.length;   // wrap around the circular buffer

        return buf[offset] & 0xFF;
    }

    /** Appends a single literal byte. */
    public void putByte(byte b) {
        buf[pos++] = b;

        if (full < pos)
            full = pos;
    }

    /**
     * Copies {@code len} bytes starting {@code dist + 1} bytes back in the
     * dictionary. If the output limit is reached first, the remainder is
     * recorded and finished later by {@link #repeatPending}.
     *
     * @throws CorruptedInputException if {@code dist} is negative or
     *         reaches beyond the data currently in the dictionary
     */
    public void repeat(int dist, int len) throws IOException {
        if (dist < 0 || dist >= full)
            throw new CorruptedInputException();

        int left = Math.min(limit - pos, len);
        pendingLen = len - left;
        pendingDist = dist;

        int back = pos - dist - 1;
        if (dist >= pos)
            back += buf.length;

        // Byte-by-byte copy: source and destination ranges may overlap
        // (len may exceed dist), so System.arraycopy cannot be used.
        do {
            buf[pos++] = buf[back++];
            if (back == buf.length)
                back = 0;
        } while (--left > 0);

        if (full < pos)
            full = pos;
    }

    /** Resumes a repeat that was interrupted by the output limit. */
    public void repeatPending() throws IOException {
        if (pendingLen > 0)
            repeat(pendingDist, pendingLen);
    }

    /**
     * Reads up to {@code len} uncompressed bytes straight into the
     * dictionary (bounded by the space left before the buffer end).
     */
    public void copyUncompressed(DataInputStream inData, int len)
            throws IOException {
        int copySize = Math.min(buf.length - pos, len);
        inData.readFully(buf, pos, copySize);
        pos += copySize;

        if (full < pos)
            full = pos;
    }

    /**
     * Copies everything produced since the previous flush into
     * {@code out} at {@code outOff}; returns the number of bytes copied.
     */
    public int flush(byte[] out, int outOff) {
        int copySize = pos - start;
        if (pos == buf.length)
            pos = 0;   // wrap the write position for the next pass

        System.arraycopy(buf, start, out, outOff, copySize);
        start = pos;

        return copySize;
    }
}
|
@ -1,139 +0,0 @@
|
||||
/*
|
||||
* LZMACoder
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.lzma;
|
||||
|
||||
import org.tukaani.xz.rangecoder.RangeCoder;
|
||||
|
||||
abstract class LZMACoder {
|
||||
static final int POS_STATES_MAX = 1 << 4;
|
||||
|
||||
static final int MATCH_LEN_MIN = 2;
|
||||
static final int MATCH_LEN_MAX = MATCH_LEN_MIN + LengthCoder.LOW_SYMBOLS
|
||||
+ LengthCoder.MID_SYMBOLS
|
||||
+ LengthCoder.HIGH_SYMBOLS - 1;
|
||||
|
||||
static final int DIST_STATES = 4;
|
||||
static final int DIST_SLOTS = 1 << 6;
|
||||
static final int DIST_MODEL_START = 4;
|
||||
static final int DIST_MODEL_END = 14;
|
||||
|
||||
static final int ALIGN_BITS = 4;
|
||||
static final int ALIGN_SIZE = 1 << ALIGN_BITS;
|
||||
static final int ALIGN_MASK = ALIGN_SIZE - 1;
|
||||
|
||||
static final int REPS = 4;
|
||||
|
||||
final int posMask;
|
||||
|
||||
final int[] rep = new int[4];
|
||||
final State state = new State();
|
||||
|
||||
final short[][] isMatch = new short[State.STATES][POS_STATES_MAX];
|
||||
final short[] isRep = new short[State.STATES];
|
||||
final short[] isRep0 = new short[State.STATES];
|
||||
final short[] isRep1 = new short[State.STATES];
|
||||
final short[] isRep2 = new short[State.STATES];
|
||||
final short[][] isRep0Long = new short[State.STATES][POS_STATES_MAX];
|
||||
final short[][] distSlots = new short[DIST_STATES][DIST_SLOTS];
|
||||
final short[][] distSpecial = { new short[2], new short[2],
|
||||
new short[4], new short[4],
|
||||
new short[8], new short[8],
|
||||
new short[16], new short[16],
|
||||
new short[32], new short[32] };
|
||||
final short[] distAlign = new short[ALIGN_SIZE];
|
||||
|
||||
static int getDistState(int len) {
|
||||
return len < DIST_STATES + MATCH_LEN_MIN
|
||||
? len - MATCH_LEN_MIN
|
||||
: DIST_STATES - 1;
|
||||
}
|
||||
|
||||
LZMACoder(int pb) {
|
||||
posMask = (1 << pb) - 1;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
rep[0] = 0;
|
||||
rep[1] = 0;
|
||||
rep[2] = 0;
|
||||
rep[3] = 0;
|
||||
state.reset();
|
||||
|
||||
for (int i = 0; i < isMatch.length; ++i)
|
||||
RangeCoder.initProbs(isMatch[i]);
|
||||
|
||||
RangeCoder.initProbs(isRep);
|
||||
RangeCoder.initProbs(isRep0);
|
||||
RangeCoder.initProbs(isRep1);
|
||||
RangeCoder.initProbs(isRep2);
|
||||
|
||||
for (int i = 0; i < isRep0Long.length; ++i)
|
||||
RangeCoder.initProbs(isRep0Long[i]);
|
||||
|
||||
for (int i = 0; i < distSlots.length; ++i)
|
||||
RangeCoder.initProbs(distSlots[i]);
|
||||
|
||||
for (int i = 0; i < distSpecial.length; ++i)
|
||||
RangeCoder.initProbs(distSpecial[i]);
|
||||
|
||||
RangeCoder.initProbs(distAlign);
|
||||
}
|
||||
|
||||
|
||||
abstract static class LiteralCoder {
|
||||
private final int lc;
|
||||
private final int literalPosMask;
|
||||
|
||||
LiteralCoder(int lc, int lp) {
|
||||
this.lc = lc;
|
||||
this.literalPosMask = (1 << lp) - 1;
|
||||
}
|
||||
|
||||
final int getSubcoderIndex(int prevByte, int pos) {
|
||||
int low = prevByte >> (8 - lc);
|
||||
int high = (pos & literalPosMask) << lc;
|
||||
return low + high;
|
||||
}
|
||||
|
||||
|
||||
abstract class LiteralSubcoder {
|
||||
final short[] probs = new short[0x300];
|
||||
|
||||
void reset() {
|
||||
RangeCoder.initProbs(probs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
abstract static class LengthCoder {
|
||||
static final int LOW_SYMBOLS = 1 << 3;
|
||||
static final int MID_SYMBOLS = 1 << 3;
|
||||
static final int HIGH_SYMBOLS = 1 << 8;
|
||||
|
||||
final short[] choice = new short[2];
|
||||
final short[][] low = new short[POS_STATES_MAX][LOW_SYMBOLS];
|
||||
final short[][] mid = new short[POS_STATES_MAX][MID_SYMBOLS];
|
||||
final short[] high = new short[HIGH_SYMBOLS];
|
||||
|
||||
void reset() {
|
||||
RangeCoder.initProbs(choice);
|
||||
|
||||
for (int i = 0; i < low.length; ++i)
|
||||
RangeCoder.initProbs(low[i]);
|
||||
|
||||
for (int i = 0; i < low.length; ++i)
|
||||
RangeCoder.initProbs(mid[i]);
|
||||
|
||||
RangeCoder.initProbs(high);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,189 +0,0 @@
|
||||
/*
|
||||
* LZMADecoder
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.lzma;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.tukaani.xz.lz.LZDecoder;
|
||||
import org.tukaani.xz.rangecoder.RangeDecoder;
|
||||
import org.tukaani.xz.CorruptedInputException;
|
||||
|
||||
/**
 * LZMA decoder: pulls bits from a {@link RangeDecoder} and writes the
 * reconstructed byte stream into an {@link LZDecoder} dictionary.
 */
public final class LZMADecoder extends LZMACoder {
    private final LZDecoder lz;
    private final RangeDecoder rc;
    private final LiteralDecoder literalDecoder;
    private final LengthDecoder matchLenDecoder = new LengthDecoder();
    private final LengthDecoder repLenDecoder = new LengthDecoder();

    /**
     * @param lz dictionary to decode into
     * @param rc range decoder supplying the compressed bits
     * @param lc number of literal context bits
     * @param lp number of literal position bits
     * @param pb number of position bits
     */
    public LZMADecoder(LZDecoder lz, RangeDecoder rc, int lc, int lp, int pb) {
        super(pb);
        this.lz = lz;
        this.rc = rc;
        this.literalDecoder = new LiteralDecoder(lc, lp);
        reset();
    }

    /** Resets the shared LZMA state and all decoder-local models. */
    public void reset() {
        super.reset();
        literalDecoder.reset();
        matchLenDecoder.reset();
        repLenDecoder.reset();
    }

    /**
     * Decodes symbols into the dictionary until its output limit is hit.
     *
     * @throws CorruptedInputException if the range decoder consumed more
     *         input than was buffered (corrupt stream)
     */
    public void decode() throws IOException {
        // First finish any match that the previous call's output limit
        // cut short.
        lz.repeatPending();

        while (lz.hasSpace()) {
            int posState = lz.getPos() & posMask;

            if (rc.decodeBit(isMatch[state.get()], posState) == 0) {
                literalDecoder.decode();
            } else {
                // Either a match with a fresh distance or a repeat of
                // one of the four recent distances.
                int len = rc.decodeBit(isRep, state.get()) == 0
                          ? decodeMatch(posState)
                          : decodeRepMatch(posState);
                lz.repeat(rep[0], len);
            }
        }

        rc.normalize();

        if (!rc.isInBufferOK())
            throw new CorruptedInputException();
    }

    /** Decodes a match whose distance is coded explicitly; returns len. */
    private int decodeMatch(int posState) throws IOException {
        state.updateMatch();

        // Shift the distance history; rep[0] receives the new distance.
        rep[3] = rep[2];
        rep[2] = rep[1];
        rep[1] = rep[0];

        int len = matchLenDecoder.decode(posState);
        int distSlot = rc.decodeBitTree(distSlots[getDistState(len)]);

        if (distSlot < DIST_MODEL_START) {
            // Small slots encode the distance directly.
            rep[0] = distSlot;
        } else {
            int limit = (distSlot >> 1) - 1;
            rep[0] = (2 | (distSlot & 1)) << limit;

            if (distSlot < DIST_MODEL_END) {
                // Mid-range: remaining bits from per-slot models.
                rep[0] |= rc.decodeReverseBitTree(
                        distSpecial[distSlot - DIST_MODEL_START]);
            } else {
                // Large distances: direct bits plus the align model for
                // the lowest ALIGN_BITS bits.
                rep[0] |= rc.decodeDirectBits(limit - ALIGN_BITS)
                          << ALIGN_BITS;
                rep[0] |= rc.decodeReverseBitTree(distAlign);
            }
        }

        return len;
    }

    /** Decodes a match that reuses one of the four recent distances. */
    private int decodeRepMatch(int posState) throws IOException {
        if (rc.decodeBit(isRep0, state.get()) == 0) {
            if (rc.decodeBit(isRep0Long[state.get()], posState) == 0) {
                // Short rep: a single byte at distance rep[0].
                state.updateShortRep();
                return 1;
            }
        } else {
            int tmp;

            // Select rep[1], rep[2], or rep[3] and move it to the front
            // of the distance history.
            if (rc.decodeBit(isRep1, state.get()) == 0) {
                tmp = rep[1];
            } else {
                if (rc.decodeBit(isRep2, state.get()) == 0) {
                    tmp = rep[2];
                } else {
                    tmp = rep[3];
                    rep[3] = rep[2];
                }

                rep[2] = rep[1];
            }

            rep[1] = rep[0];
            rep[0] = tmp;
        }

        state.updateLongRep();

        return repLenDecoder.decode(posState);
    }


    /** Decodes literal bytes, choosing a subdecoder by lc/lp context. */
    private class LiteralDecoder extends LiteralCoder {
        final LiteralSubdecoder[] subdecoders;

        LiteralDecoder(int lc, int lp) {
            super(lc, lp);

            subdecoders = new LiteralSubdecoder[1 << (lc + lp)];
            for (int i = 0; i < subdecoders.length; ++i)
                subdecoders[i] = new LiteralSubdecoder();
        }

        void reset() {
            for (int i = 0; i < subdecoders.length; ++i)
                subdecoders[i].reset();
        }

        void decode() throws IOException {
            int i = getSubcoderIndex(lz.getByte(0), lz.getPos());
            subdecoders[i].decode();
        }


        private class LiteralSubdecoder extends LiteralSubcoder {
            void decode() throws IOException {
                int symbol = 1;

                if (state.isLiteral()) {
                    // Previous op was a literal: decode eight plain bits,
                    // most significant first.
                    do {
                        symbol = (symbol << 1) | rc.decodeBit(probs, symbol);
                    } while (symbol < 0x100);

                } else {
                    // Previous op was a match: the byte at distance rep[0]
                    // predicts each bit (matched-literal coding).
                    int matchByte = lz.getByte(rep[0]);
                    int offset = 0x100;
                    int matchBit;
                    int bit;

                    do {
                        matchByte <<= 1;
                        matchBit = matchByte & offset;
                        bit = rc.decodeBit(probs, offset + matchBit + symbol);
                        symbol = (symbol << 1) | bit;
                        // Once a decoded bit diverges from the match byte,
                        // offset drops to 0 and plain probabilities apply.
                        offset &= (-bit) ^ ~matchBit;
                    } while (symbol < 0x100);
                }

                lz.putByte((byte)symbol);
                state.updateLiteral();
            }
        }
    }


    /** Decodes match lengths from the low/mid/high length trees. */
    private class LengthDecoder extends LengthCoder {
        int decode(int posState) throws IOException {
            if (rc.decodeBit(choice, 0) == 0)
                return rc.decodeBitTree(low[posState]) + MATCH_LEN_MIN;

            if (rc.decodeBit(choice, 1) == 0)
                return rc.decodeBitTree(mid[posState])
                       + MATCH_LEN_MIN + LOW_SYMBOLS;

            return rc.decodeBitTree(high)
                   + MATCH_LEN_MIN + LOW_SYMBOLS + MID_SYMBOLS;
        }
    }
}
|
@ -1,65 +0,0 @@
|
||||
/*
|
||||
* State
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.lzma;
|
||||
|
||||
/**
 * LZMA state machine. Tracks the kinds of the most recent coding
 * operations (literal, match, long rep, short rep) so the coder can
 * pick the right probability context; there are 12 distinct states.
 */
final class State {
    static final int STATES = 12;

    // States below this value mean the previous operation was a literal.
    private static final int LIT_STATES = 7;

    private static final int LIT_LIT = 0;
    private static final int MATCH_LIT_LIT = 1;
    private static final int REP_LIT_LIT = 2;
    private static final int SHORTREP_LIT_LIT = 3;
    private static final int MATCH_LIT = 4;
    private static final int REP_LIT = 5;
    private static final int SHORTREP_LIT = 6;
    private static final int LIT_MATCH = 7;
    private static final int LIT_LONGREP = 8;
    private static final int LIT_SHORTREP = 9;
    private static final int NONLIT_MATCH = 10;
    private static final int NONLIT_REP = 11;

    private int state;

    /** Returns to the initial state. */
    void reset() {
        state = LIT_LIT;
    }

    /** Current state index, in the range [0, STATES). */
    int get() {
        return state;
    }

    /** Records that a literal was coded. */
    void updateLiteral() {
        if (state < MATCH_LIT) {
            // Two or more literals in a row collapse to LIT_LIT.
            state = LIT_LIT;
        } else if (state < NONLIT_MATCH) {
            state -= 3;
        } else {
            state -= 6;
        }
    }

    /** Records that a match with an explicit distance was coded. */
    void updateMatch() {
        if (state < LIT_STATES)
            state = LIT_MATCH;
        else
            state = NONLIT_MATCH;
    }

    /** Records that a long repeated match was coded. */
    void updateLongRep() {
        if (state < LIT_STATES)
            state = LIT_LONGREP;
        else
            state = NONLIT_REP;
    }

    /** Records that a short repeated match (length 1) was coded. */
    void updateShortRep() {
        if (state < LIT_STATES)
            state = LIT_SHORTREP;
        else
            state = NONLIT_REP;
    }

    /** True when the previous operation was a literal. */
    boolean isLiteral() {
        return state < LIT_STATES;
    }
}
|
@ -1,21 +0,0 @@
|
||||
/**
|
||||
* XZ data compression support.
|
||||
* <p>
|
||||
* In the (very) long term, this aims to be a complete implementation of
|
||||
* XZ data compression in Java. Currently only streamed decompression is
|
||||
* supported.
|
||||
* <p>
|
||||
* For the latest source code, see the
|
||||
* <a href="http://tukaani.org/xz/java.html">home page of XZ in Java</a>.
|
||||
*
|
||||
* <h3>Decompression notes</h3>
|
||||
*
|
||||
* If you are decompressing complete files and your application knows
|
||||
* exactly how much uncompressed data there should be, it is still good
|
||||
* to try reading one more byte by calling <code>read()</code> and checking
|
||||
* that it returns <code>-1</code>. This way the decompressor will parse the
|
||||
* file footers and verify the integrity checks, giving the caller more
|
||||
* confidence that the uncompressed data is valid. (This advice seems to
|
||||
* apply to <code>java.util.zip.GZIPInputStream</code> too.)
|
||||
*/
|
||||
package org.tukaani.xz;
|
@ -1,25 +0,0 @@
|
||||
/*
|
||||
* RangeCoder
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.rangecoder;
|
||||
|
||||
/**
 * Constants shared by the range encoder and decoder, plus the helper
 * that resets adaptive probability arrays.
 */
public abstract class RangeCoder {
    static final int SHIFT_BITS = 8;
    static final int TOP_MASK = 0xFF000000;
    static final int BIT_MODEL_TOTAL_BITS = 11;
    static final int BIT_MODEL_TOTAL = 1 << BIT_MODEL_TOTAL_BITS;
    static final short PROB_INIT = (short)(BIT_MODEL_TOTAL / 2);
    static final int MOVE_BITS = 5;

    /**
     * Resets every adaptive probability in {@code probs} to the neutral
     * initial estimate (BIT_MODEL_TOTAL / 2, i.e. probability 0.5).
     */
    public static void initProbs(short[] probs) {
        java.util.Arrays.fill(probs, PROB_INIT);
    }
}
|
@ -1,129 +0,0 @@
|
||||
/*
|
||||
* RangeDecoder
|
||||
*
|
||||
* Authors: Lasse Collin <lasse.collin@tukaani.org>
|
||||
* Igor Pavlov <http://7-zip.org/>
|
||||
*
|
||||
* This file has been put into the public domain.
|
||||
* You can do whatever you want with this file.
|
||||
*/
|
||||
|
||||
package org.tukaani.xz.rangecoder;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import org.tukaani.xz.CorruptedInputException;
|
||||
|
||||
/**
 * Range decoder operating on an in-memory buffer that is refilled one
 * compressed chunk at a time via {@link #prepareInputBuffer}.
 */
public final class RangeDecoder extends RangeCoder {
    // One mandatory zero byte plus the four bytes that initialize `code`
    // (see prepareInputBuffer).
    private static final int INIT_SIZE = 5;

    private final byte[] buf;
    private int pos = 0;   // next byte to consume from buf
    private int end = 0;   // number of valid bytes in buf

    private int range = 0;
    private int code = 0;

    /**
     * @param inputSizeMax largest chunk (including the INIT_SIZE header
     *        bytes) that will ever be passed to prepareInputBuffer
     */
    public RangeDecoder(int inputSizeMax) {
        buf = new byte[inputSizeMax - INIT_SIZE];
    }

    /**
     * Reads a compressed chunk of {@code len} bytes and initializes the
     * decoder state from its first five bytes.
     *
     * @throws CorruptedInputException if the chunk is shorter than the
     *         header or its mandatory leading zero byte is missing
     * @throws IOException if reading from {@code in} fails
     */
    public void prepareInputBuffer(DataInputStream in, int len)
            throws IOException {
        if (len < INIT_SIZE)
            throw new CorruptedInputException();

        if (in.readUnsignedByte() != 0x00)
            throw new CorruptedInputException();

        code = in.readInt();
        range = 0xFFFFFFFF;

        pos = 0;
        end = len - INIT_SIZE;
        in.readFully(buf, 0, end);
    }

    /** False if decoding read past the buffered chunk (corrupt input). */
    public boolean isInBufferOK() {
        return pos <= end;
    }

    /** True when the chunk was consumed exactly and decoding ended cleanly. */
    public boolean isFinished() {
        return pos == end && code == 0;
    }

    /**
     * Pulls in another input byte whenever the range has shrunk below
     * the top-byte threshold, keeping the arithmetic precise.
     */
    public void normalize() throws IOException {
        if ((range & TOP_MASK) == 0) {
            try {
                // If the input is corrupt, this might throw
                // ArrayIndexOutOfBoundsException.
                code = (code << SHIFT_BITS) | (buf[pos++] & 0xFF);
                range <<= SHIFT_BITS;
            } catch (ArrayIndexOutOfBoundsException e) {
                throw new CorruptedInputException();
            }
        }
    }

    /**
     * Decodes one bit using — and adaptively updating — the probability
     * stored at {@code probs[index]}.
     */
    public int decodeBit(short[] probs, int index) throws IOException {
        normalize();

        int prob = probs[index];
        int bound = (range >>> BIT_MODEL_TOTAL_BITS) * prob;
        int bit;

        // Compare code and bound as if they were unsigned 32-bit integers.
        if ((code ^ 0x80000000) < (bound ^ 0x80000000)) {
            range = bound;
            // Bit was 0: nudge the probability of 0 upward.
            probs[index] = (short)(
                    prob + ((BIT_MODEL_TOTAL - prob) >>> MOVE_BITS));
            bit = 0;
        } else {
            range -= bound;
            code -= bound;
            // Bit was 1: nudge the probability of 0 downward.
            probs[index] = (short)(prob - (prob >>> MOVE_BITS));
            bit = 1;
        }

        return bit;
    }

    /** Decodes a symbol via a bit tree, most significant bit first. */
    public int decodeBitTree(short[] probs) throws IOException {
        int symbol = 1;

        do {
            symbol = (symbol << 1) | decodeBit(probs, symbol);
        } while (symbol < probs.length);

        return symbol - probs.length;
    }

    /** Decodes a symbol via a bit tree, least significant bit first. */
    public int decodeReverseBitTree(short[] probs) throws IOException {
        int symbol = 1;
        int i = 0;
        int result = 0;

        do {
            int bit = decodeBit(probs, symbol);
            symbol = (symbol << 1) | bit;
            result |= bit << i++;
        } while (symbol < probs.length);

        return result;
    }

    /** Decodes {@code count} bits with a fixed 0.5 probability each. */
    public int decodeDirectBits(int count) throws IOException {
        int result = 0;

        do {
            normalize();

            range >>>= 1;
            // t is 0 when code >= range (bit 1), else all-ones mask math
            // below leaves code untouched (bit 0).
            int t = (code - range) >>> 31;
            code -= range & (t - 1);
            result = (result << 1) | (1 - t);
        } while (--count != 0);

        return result;
    }
}
|
Loading…
Reference in New Issue
Block a user