(control) Fully automatic conversion

Removed the need to run an external pre-processing tool in order to load stackexchange-style data into the search engine.

Removed the tool itself.

This stirred up some dependency issues, caused by both vendoring xz under third-party and importing it as an external dependency.  This has been fixed, and :third-party:xz was removed in favor of the upstream org.tukaani:xz artifact.
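The xz wiring now goes through the version catalog; the relevant lines from this commit's settings.gradle and a consuming module's build.gradle:

```gradle
// settings.gradle: version catalog entry
library('xz','org.tukaani','xz').version('1.9')

// build.gradle of a consuming module
dependencies {
    implementation libs.xz
}
```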
Viktor Lofgren 2024-01-22 13:01:09 +01:00
parent 3a325845c7
commit 40c9d2050f
74 changed files with 330 additions and 4057 deletions

View File

@@ -13,7 +13,6 @@ java {
 dependencies {
     implementation libs.bundles.slf4j
-    implementation 'org.tukaani:xz:1.8'
     implementation project(':code:libraries:blocking-thread-pool')
     implementation project(':code:common:model')
     implementation libs.notnull
@@ -26,6 +25,7 @@ dependencies {
     implementation libs.zstd
     implementation libs.trove
     implementation libs.commons.compress
+    implementation libs.xz
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit

View File

@@ -36,8 +36,8 @@ public class StackExchangePostsDb {
     public static void create(String domain,
                               Path sqliteFile,
                               Path stackExchange7zFile) {
-        if (Files.exists(sqliteFile))
-            Files.delete(sqliteFile);
+        Files.deleteIfExists(sqliteFile);

         String connStr = "jdbc:sqlite:" + sqliteFile;
         try (var connection = DriverManager.getConnection(connStr);

View File

@@ -95,8 +95,6 @@ dependencies {
     testImplementation libs.bundles.junit
     testImplementation libs.mockito

-    implementation 'org.tukaani:xz:1.8'
-
     testImplementation project(':code:processes:test-data')
     testImplementation project(':code:processes:crawling-process')
 }

View File

@@ -240,50 +240,57 @@ public class ConverterMain extends ProcessMainClass {
         var msgOpt = getMessage(inbox, nu.marginalia.mqapi.converting.ConvertRequest.class.getSimpleName());
         var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));

-        var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class);
-
-        return switch(request.action) {
+        try {
+            var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class);
+
+            return switch (request.action) {
                 case ConvertCrawlData -> {
                     var crawlData = fileStorageService.getStorage(request.crawlStorage);
                     var processData = fileStorageService.getStorage(request.processedDataStorage);

                     var plan = new CrawlPlan(null,
                             new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"),
                             new CrawlPlan.WorkDir(processData.path(), "processor.log"));

                     yield new ConvertCrawlDataAction(plan, msg, inbox);
                 }
                 case SideloadEncyclopedia -> {
                     var processData = fileStorageService.getStorage(request.processedDataStorage);

                     yield new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(Path.of(request.inputSource), request.baseUrl),
                             processData.asPath(),
                             msg, inbox);
                 }
                 case SideloadDirtree -> {
                     var processData = fileStorageService.getStorage(request.processedDataStorage);

                     yield new SideloadAction(
                             sideloadSourceFactory.sideloadDirtree(Path.of(request.inputSource)),
                             processData.asPath(),
                             msg, inbox);
                 }
                 case SideloadWarc -> {
                     var processData = fileStorageService.getStorage(request.processedDataStorage);

                     yield new SideloadAction(
                             sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)),
                             processData.asPath(),
                             msg, inbox);
                 }
                 case SideloadStackexchange -> {
                     var processData = fileStorageService.getStorage(request.processedDataStorage);

                     yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(Path.of(request.inputSource)),
                             processData.asPath(),
                             msg, inbox);
                 }
-        };
+            };
+        }
+        catch (Exception ex) {
+            inbox.sendResponse(msg, MqInboxResponse.err(STR."\{ex.getClass().getSimpleName()}: \{ex.getMessage()}"));
+            throw ex;
+        }
     }

     private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException {

View File

@@ -16,6 +16,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.Collection;
+import java.util.List;

 public class SideloadSourceFactory {
     private final Gson gson;
@@ -57,14 +58,21 @@ public class SideloadSourceFactory {
         return warcSideloadFactory.createSideloaders(pathToWarcFiles);
     }

-    /** Do not use, this code isn't finished */
     public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
-        try (var dirs = Files.walk(pathToDbFileRoot)) {
-            return dirs
-                    .filter(Files::isRegularFile)
-                    .filter(f -> f.toFile().getName().endsWith(".db"))
-                    .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
-                    .toList();
-        }
+        if (Files.isRegularFile(pathToDbFileRoot)) {
+            return List.of(new StackexchangeSideloader(pathToDbFileRoot, sentenceExtractorProvider, documentKeywordExtractor));
+        }
+        else if (Files.isDirectory(pathToDbFileRoot)) {
+            try (var dirs = Files.walk(pathToDbFileRoot)) {
+                return dirs
+                        .filter(Files::isRegularFile)
+                        .filter(f -> f.toFile().getName().endsWith(".db"))
+                        .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor))
+                        .toList();
+            }
+        }
+        else { // unix socket, etc
+            throw new IllegalArgumentException("Path to stackexchange db file(s) must be a file or directory");
+        }
     }
 }
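In practice this means the stackexchange sideload target may now be either a single pre-converted sqlite file or a directory that is walked recursively for `*.db` files. A minimal sketch of the two call shapes (the `factory` instance and the paths are hypothetical):

```java
// A single pre-converted database is wrapped in a singleton list...
Collection<? extends SideloadSource> one =
        factory.sideloadStackexchange(Path.of("/uploads/askubuntu.com.7z.db"));

// ...while a directory is walked recursively, picking up every *.db file beneath it.
Collection<? extends SideloadSource> many =
        factory.sideloadStackexchange(Path.of("/uploads/stackexchange/"));
```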

View File

@@ -21,23 +21,33 @@ public class WarcSideloadFactory {
     }

     public Collection<? extends SideloadSource> createSideloaders(Path pathToWarcFiles) throws IOException {
-        final List<Path> files = new ArrayList<>();
-
-        try (var stream = Files.list(pathToWarcFiles)) {
-            stream
-                    .filter(Files::isRegularFile)
-                    .filter(this::isWarcFile)
-                    .forEach(files::add);
-        }
-
-        List<WarcSideloader> sources = new ArrayList<>();
-        for (Path file : files) {
-            sources.add(new WarcSideloader(file, processing));
-        }
-        return sources;
+        if (Files.isRegularFile(pathToWarcFiles)) {
+            return List.of(new WarcSideloader(pathToWarcFiles, processing));
+        }
+        else if (Files.isDirectory(pathToWarcFiles)) {
+            final List<Path> files = new ArrayList<>();
+
+            try (var stream = Files.list(pathToWarcFiles)) {
+                stream
+                        .filter(Files::isRegularFile)
+                        .filter(this::isWarcFile)
+                        .forEach(files::add);
+            }
+
+            List<WarcSideloader> sources = new ArrayList<>();
+            for (Path file : files) {
+                sources.add(new WarcSideloader(file, processing));
+            }
+            return sources;
+        }
+        else {
+            throw new IllegalArgumentException("Path " + pathToWarcFiles + " is neither a file nor a directory");
+        }
     }

     private boolean isWarcFile(Path path) {

View File

@@ -40,6 +40,7 @@ dependencies {
     implementation project(':code:process-models:crawling-model')
     implementation project(':code:features-crawl:link-parser')
     implementation project(':code:features-convert:data-extractors')
+    implementation project(':code:features-convert:stackexchange-xml')
     implementation project(':code:features-index:index-journal')
     implementation project(':code:api:index-api')
     implementation project(':code:api:query-api')

View File

@@ -10,6 +10,8 @@ import nu.marginalia.actor.state.Resume;
 import nu.marginalia.encyclopedia.EncyclopediaConverter;
 import nu.marginalia.process.ProcessOutboxes;
 import nu.marginalia.process.ProcessService;
+import nu.marginalia.sideload.SideloadHelper;
+import nu.marginalia.sideload.StackExchangeSideloadHelper;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageBaseType;
 import nu.marginalia.storage.model.FileStorageId;
@@ -21,11 +23,8 @@ import nu.marginalia.mqapi.converting.ConvertRequest;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.zip.CRC32;

 @Singleton
 public class ConvertActor extends RecordActorPrototype {
@@ -109,7 +108,7 @@ public class ConvertActor extends RecordActorPrototype {
                 if (source.toLowerCase().endsWith(".zim")) {
                     // If we're fed a ZIM file, we need to convert it to a sqlite database first
-                    String hash = getCrc32FileHash(sourcePath);
+                    String hash = SideloadHelper.getCrc32FileHash(sourcePath);

                     // To avoid re-converting the same file, we'll assign the file a name based on its hash
                     // and the original filename. This way, if we're fed the same file again, we'll be able to just
@@ -179,6 +178,10 @@ public class ConvertActor extends RecordActorPrototype {
                 storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW);

+                // Convert stackexchange data to sqlite database
+                // (we can't use a Predigest- step here because the conversion is too complicated)
+                StackExchangeSideloadHelper.convertStackexchangeData(sourcePath);
+
                 // Pre-send convert request
                 yield new ConvertWait(
@@ -200,21 +203,7 @@ public class ConvertActor extends RecordActorPrototype {
         };
     }

-    private String getCrc32FileHash(Path file) throws IOException {
-        ByteBuffer buffer = ByteBuffer.allocate(8192);
-
-        try (var channel = Files.newByteChannel(file)) {
-            CRC32 crc = new CRC32();
-
-            while (channel.read(buffer) > 0) {
-                buffer.flip();
-                crc.update(buffer);
-                buffer.clear();
-            }
-
-            return Long.toHexString(crc.getValue());
-        }
-    }

     @Override
     public String describe() {

View File

@@ -0,0 +1,25 @@
package nu.marginalia.sideload;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.CRC32;
public class SideloadHelper {
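/** Computes the hex-encoded CRC32 checksum of a file; used to fingerprint
 * sideload inputs so that already-converted files are not converted again. */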
public static String getCrc32FileHash(Path file) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(8192);
try (var channel = Files.newByteChannel(file)) {
CRC32 crc = new CRC32();
while (channel.read(buffer) > 0) {
buffer.flip();
crc.update(buffer);
buffer.clear();
}
return Long.toHexString(crc.getValue());
}
}
}

View File

@@ -0,0 +1,102 @@
package nu.marginalia.sideload;
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Optional;
import java.util.zip.CRC32;
/** Contains helper functions for pre-converting stackexchange style 7z
* files to marginalia-digestible sqlite databases*/
public class StackExchangeSideloadHelper {
private static final Logger logger = LoggerFactory.getLogger(StackExchangeSideloadHelper.class);
/** Looks for stackexchange 7z files in the given path and converts them to sqlite databases.
* The function is idempotent, so it is safe to call it multiple times on the same path
* (it will not re-convert files that have already been successfully converted)
* */
public static void convertStackexchangeData(Path sourcePath) {
if (Files.isDirectory(sourcePath)) {
try (var contents = Files.list(sourcePath)) {
contents.filter(Files::isRegularFile)
.parallel()
.forEach(StackExchangeSideloadHelper::convertSingleStackexchangeFile);
} catch (IOException ex) {
logger.warn("Failed to convert stackexchange 7z file to sqlite database", ex);
}
} else if (Files.isRegularFile(sourcePath)) {
convertSingleStackexchangeFile(sourcePath);
}
}
private static void convertSingleStackexchangeFile(Path sourcePath) {
String fileName = sourcePath.toFile().getName();
if (fileName.endsWith(".db")) return;
if (!fileName.endsWith(".7z")) return;
Optional<String> domain = getStackexchangeDomainFromFilename(fileName);
if (domain.isEmpty())
return;
try {
Path destPath = getStackexchangeDbPath(sourcePath);
if (Files.exists(destPath)) return;
Path tempFile = Files.createTempFile(destPath.getParent(), "processed", "db.tmp");
try {
logger.info("Converting stackexchange 7z file {} to sqlite database", sourcePath);
StackExchangePostsDb.create(domain.get(), tempFile, sourcePath);
logger.info("Finished converting stackexchange 7z file {} to sqlite database", sourcePath);
Files.move(tempFile, destPath, StandardCopyOption.REPLACE_EXISTING);
} catch (Exception e) {
logger.error("Failed to convert stackexchange 7z file to sqlite database", e);
Files.deleteIfExists(tempFile);
Files.deleteIfExists(destPath);
}
} catch (IOException ex) {
logger.warn("Failed to convert stackexchange 7z file to sqlite database", ex);
}
}
private static Path getStackexchangeDbPath(Path sourcePath) throws IOException {
String fileName = sourcePath.toFile().getName();
String hash = SideloadHelper.getCrc32FileHash(sourcePath);
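// e.g. "foo.stackexchange.com.7z" becomes "foo.stackexchange.com.7z.<crc32>.db", written next to the source file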
return sourcePath.getParent().resolve(STR."\{fileName}.\{hash}.db");
}
private static Optional<String> getStackexchangeDomainFromFilename(String fileName) {
// We are only interested in .tld.7z files
if (!fileName.endsWith(".7z") && fileName.length() > 7)
return Optional.empty();
// Stackoverflow is special, because it has one 7z file per site
// (we only want Posts)
if (fileName.equals("stackoverflow-Posts.7z"))
return Optional.of("stackoverflow.com");
else if (fileName.startsWith("stackoverflow.com-")) {
return Optional.empty();
}
// For stackexchange, we filter out the meta archives
// We are not interested in the meta files
if (fileName.startsWith("meta."))
return Optional.empty();
if (fileName.contains(".meta."))
return Optional.empty();
// Pattern is 'foobar.stackexchange.com.7z'
return Optional.of(fileName.substring(0, fileName.length() - 3));
}
}

View File

@@ -1,40 +0,0 @@
plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
application {
mainClass = 'nu.marginalia.tools.StackexchangeConverter'
applicationName = 'stackexchange-converter'
}
tasks.distZip.enabled = false
dependencies {
implementation project(':code:features-convert:stackexchange-xml')
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.guice
implementation libs.jsoup
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.nlp
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@@ -1,24 +0,0 @@
This tool converts from stackexchange's 7z-compressed XML
format to a sqlite database that is digestible by the search engine.
See [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) for
an explanation why this is necessary.
Stackexchange's data dumps can be downloaded from archive.org
here: [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)
<b>Usage</b>
```shell
$ stackexchange-converter domain-name input.7z output.db
```
Stackexchange is relatively conservative about allowing
new questions, so this is a job that doesn't need to run more than once.
<b>Note</b>: Reading and writing these db files is *absurdly* slow
on a mechanical hard-drive.
## See Also
* [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml)

View File

@@ -1,31 +0,0 @@
package nu.marginalia.tools;
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import java.nio.file.Files;
import java.nio.file.Path;
public class StackexchangeConverter {
public static void main(String[] args) {
if (args.length != 3) {
System.err.println("Converts a stackexchange Posts 7z file to a Marginalia-digestible sqlite-db\n");
System.err.println("Arguments: domain-name input-file.7z output-file.db");
return;
}
String domain = args[0];
Path inputFile = Path.of(args[1]);
Path outputFile = Path.of(args[2]);
if (!Files.exists(inputFile))
System.err.println("Input file " + inputFile + " does not exists");
System.out.println("Converting " + inputFile);
StackExchangePostsDb.create(domain, outputFile, inputFile);
System.out.println("... done!");
}
}

BIN  doc/images/convert_2.png (new file; binary, 62 KiB, not shown)

BIN  doc/images/load_warc.png (new file; binary, 35 KiB, not shown)

BIN  (new image file; binary, 48 KiB, not shown)

BIN  (new image file; binary, 44 KiB, not shown)

View File

@@ -1,23 +1,121 @@
 # Sideloading How-To

+(This document is a bit of a draft to get this down in writing
+while it's still fresh in my head.)
+
 Some websites are much larger than others, this includes
 Wikipedia, Stack Overflow, and a few others. They are so
 large they are impractical to crawl in the traditional fashion,
 but luckily they make available data dumps that can be processed
 and loaded into the search engine through other means.

-## Notes on Docker
-
-If you're running the system in docker, you'll need to provide the paths
-to the data in a way where it is available to the docker container.
-
-Either mount the data into the executor's container, or copy it into e.g.
-the data directory, which is mounted into the container as `/wmsa/data`.
-
-For a test deployment, a file placed in `run/data/foo` will be available
-in the container as `/wmsa/data/foo`.
+To this end, it's possible to sideload data into the search engine
+from sources other than the web crawler.
+
+## Index Nodes
+
+In practice, if you want to sideload data, you need to do it on
+a separate index node. Index nodes are separate instances of the
+index software. The default configuration is to have two index nodes:
+one for the web crawler, and one for sideloaded data.
+
+The need for a separate node is due to incompatibilities in the workflows.
+It is also a good idea in general, as a very large domain can easily consume
+the entire time budget of a query just sifting through documents from that
+one domain. This is especially true with something like Wikipedia, which has
+a lot of documents at least tangentially related to any given topic.
+
+This how-to assumes that you are operating on index-node 2.
+
+## Notes on the upload directory
+
+This is written assuming that the system is installed with the `install.sh`
+script, which deploys the system with docker-compose, and has a directory
+structure like
+
+```
+...
+index-1/backup/
+index-1/index/
+index-1/storage/
+index-1/uploads/
+index-1/work/
+index-2/backup/
+index-2/index/
+index-2/storage/
+index-2/uploads/
+index-2/work/
+...
+```
+
+We're going to be putting files in the **uploads** directories. If you have installed
+the system in some other way, or changed the configuration significantly, you need
+to adjust the paths accordingly.
+
+## Sideloading
+
+The sideloading actions are available through the Actions menu on each node.
+
+![Sideload menu](images/sideload_menu.png)
+
+## Sideloading WARCs
+
+WARC files are the standard format for web archives. They can be created e.g. with wget.
+The Marginalia software can read WARC files directly and sideload them into the index,
+as long as each WARC file contains only one domain.
+
+Let's for example archive www.marginalia.nu (I own this domain, so feel free to try this at home):
+
+```bash
+$ wget -r --warc-file=marginalia www.marginalia.nu
+```
+
+**Note**: If you intend to do this on other websites, you should probably add a `--wait` parameter,
+e.g. `wget --wait=1 -r --warc-file=...`, to avoid hammering the website with requests and getting blocked.
+
+This will take a moment, and create a file called `marginalia.warc.gz`. We move it to the
+upload directory of the index node, and sideload it through the Actions menu.
+
+```bash
+$ mkdir -p index-2/uploads/marginalia-warc
+$ mv marginalia.warc.gz index-2/uploads/marginalia-warc
+```
+
+Go to the Actions menu, and select the "Sideload WARC" action. This will show a list of
+subdirectories in the uploads directory. Select the directory containing the WARC file, and
+click "Sideload".
+
+![Sideload WARC screenshot](images/sideload_warc.png)
+
+This should take you to the node overview, where you can see the progress of the sideloading.
+It will take a moment, as the WARC file is being processed.
+
+![Processing in progress](images/convert_2.png)
+
+It will not be loaded automatically. This is to permit you to sideload multiple sources.
+When you are ready to load it, go to the Actions menu, and select "Load Crawl Data".
+
+![Load Crawl Data](images/load_warc.png)
+
+Select all the sources you want to load, and click "Load". This will load the data into the
+index, and make it available for searching.
+
+## Sideloading Wikipedia
+
+Due to licensing incompatibilities with OpenZim's GPL-2 and AGPL, the workflow
+depends on using the conversion process from [https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/)
+to pre-digest the data.
+
+Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu)
+and follow the instructions for downloading a ZIM file, and then run something like
+
+```$ ./encyclopedia convert file.zim articles.db```
+
+This db-file can be processed and loaded into the search engine through the
+Actions view.
+
+FIXME: It will currently only point to en.wikipedia.org; this should be
+made configurable.

 ## Sideloading a directory tree
@@ -98,23 +196,6 @@ python-3.11.5/[...]
 This yaml-file can be processed and loaded into the search engine through the
 Actions view.

-## Sideloading Wikipedia
-
-For now, this workflow depends on using the conversion process from
-[https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/)
-to pre-digest the data. This is because it uses OpenZIM, which has a
-license that is incompatible with this project.
-
-Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu)
-and follow the instructions for downloading a ZIM file, and then run something like
-
-```$./encyclopedia convert file.zim articles.db```
-
-This db-file can be processed and loaded into the search engine through the
-Actions view.
-
-FIXME: It will currently only point to en.wikipedia.org, this should be
-made configurable.
-
 ## Sideloading Stack Overflow/Stackexchange

View File

@@ -85,11 +85,9 @@ include 'code:tools:term-frequency-extractor'
 include 'code:tools:experiment-runner'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
-include 'code:tools:stackexchange-converter'
 include 'code:tools:crawl-data-unfcker'

 include 'third-party:porterstemmer'
-include 'third-party:xz'
 include 'third-party:symspell'
 include 'third-party:rdrpostagger'
 include 'third-party:openzim'
@@ -164,7 +162,7 @@ dependencyResolutionManagement {
         library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
         library('commons.net', 'commons-net','commons-net').version('3.9.0')
         library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
-        library('commons.compress','org.apache.commons','commons-compress').version('1.21')
+        library('commons.compress','org.apache.commons','commons-compress').version('1.25.0')
         library('commons.io','commons-io','commons-io').version('2.11.0')
         library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0')
@@ -185,6 +183,7 @@ dependencyResolutionManagement {
         library('zstd','com.github.luben','zstd-jni').version('1.5.2-2')
         library('lz4','org.lz4','lz4-java').version('1.8.0')
+        library('xz','org.tukaani','xz').version('1.9')

         library('flyway.core','org.flywaydb','flyway-core').version('10.4.1')
         library('flyway.mysql','org.flywaydb','flyway-mysql').version('10.4.1')

View File

@@ -17,7 +17,6 @@ dependencies {
     implementation project(':code:libraries:blocking-thread-pool')

-    implementation project(':third-party:xz')
     implementation project(':third-party:openzim')
 }

View File

@@ -16,7 +16,7 @@ dependencies {
     implementation libs.databind
     implementation libs.bundles.gson

-    implementation project(':third-party:xz')
+    implementation libs.xz
 }

 test {

View File

@@ -1,16 +0,0 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
dependencies {
}
test {
useJUnitPlatform()
}

View File

@@ -1,9 +0,0 @@
# XZ
[XZ for Java](https://tukaani.org/xz/) - Public Domain
"XZ Utils is free general-purpose data compression software with a high compression ratio.
XZ Utils were written for POSIX-like systems, but also work on some not-so-POSIX systems.
XZ Utils are the successor to LZMA Utils."
Needed for [openzim](../openzim) to deal with modern zim files.

View File

@@ -1,212 +0,0 @@
/*
* BlockInputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Arrays;
import org.tukaani.xz.common.DecoderUtil;
import org.tukaani.xz.check.Check;
class BlockInputStream extends InputStream {
private final InputStream in;
private final DataInputStream inData;
private final CountingInputStream inCounted;
private InputStream filterChain;
private final Check check;
private long uncompressedSizeInHeader = -1;
private long compressedSizeInHeader = -1;
private long compressedSizeLimit;
private final int headerSize;
private long uncompressedSize = 0;
public BlockInputStream(InputStream in, Check check, int memoryLimit)
throws IOException, IndexIndicatorException {
this.in = in;
this.check = check;
inData = new DataInputStream(in);
byte[] buf = new byte[DecoderUtil.BLOCK_HEADER_SIZE_MAX];
// Block Header Size or Index Indicator
inData.readFully(buf, 0, 1);
// See if this begins the Index field.
if (buf[0] == 0x00)
throw new IndexIndicatorException();
// Read the rest of the Block Header.
headerSize = 4 * (buf[0] + 1);
inData.readFully(buf, 1, headerSize - 1);
// Validate the CRC32.
if (!DecoderUtil.isCRC32Valid(buf, 0, headerSize - 4, headerSize - 4))
throw new CorruptedInputException("XZ Block Header is corrupt");
// Check for reserved bits in Block Flags.
if ((buf[1] & 0x3C) != 0)
throw new UnsupportedOptionsException(
"Unsupported options in XZ Block Header");
// Memory for the Filter Flags field
int filterCount = (buf[1] & 0x03) + 1;
long[] filterIDs = new long[filterCount];
byte[][] filterProps = new byte[filterCount][];
// Use a stream to parse the fields after the Block Flags field.
// Exclude the CRC32 field at the end.
ByteArrayInputStream bufStream = new ByteArrayInputStream(
buf, 2, headerSize - 6);
try {
// Set the maximum valid compressed size. This is overridden
// by the value from the Compressed Size field if it is present.
compressedSizeLimit = (DecoderUtil.VLI_MAX & ~3)
- headerSize - check.getSize();
// Decode and validate Compressed Size if the relevant flag
// is set in Block Flags.
if ((buf[1] & 0x40) != 0x00) {
compressedSizeInHeader = DecoderUtil.decodeVLI(bufStream);
if (compressedSizeInHeader == 0
|| compressedSizeInHeader > compressedSizeLimit)
throw new CorruptedInputException();
compressedSizeLimit = compressedSizeInHeader;
}
// Decode Uncompressed Size if the relevant flag is set
// in Block Flags.
if ((buf[1] & 0x80) != 0x00)
uncompressedSizeInHeader = DecoderUtil.decodeVLI(bufStream);
// Decode Filter Flags.
for (int i = 0; i < filterCount; ++i) {
filterIDs[i] = DecoderUtil.decodeVLI(bufStream);
long filterPropsSize = DecoderUtil.decodeVLI(bufStream);
if (filterPropsSize > bufStream.available())
throw new CorruptedInputException();
filterProps[i] = new byte[(int)filterPropsSize];
bufStream.read(filterProps[i]);
}
} catch (IOException e) {
throw new CorruptedInputException("XZ Block Header is corrupt");
}
// Check that the remaining bytes are zero.
for (int i = bufStream.available(); i > 0; --i)
if (bufStream.read() != 0x00)
throw new UnsupportedOptionsException(
"Unsupported options in XZ Block Header");
// Check if the Filter IDs are supported, decode
// the Filter Properties, and check that they are
// supported by this decoder implementation.
FilterDecoder[] filters = new FilterDecoder[filterIDs.length];
for (int i = 0; i < filters.length; ++i) {
if (filterIDs[i] == LZMA2Coder.FILTER_ID)
filters[i] = new LZMA2Decoder(filterProps[i]);
else if (filterIDs[i] == DeltaCoder.FILTER_ID)
filters[i] = new DeltaDecoder(filterProps[i]);
else
throw new UnsupportedOptionsException(
"Unknown Filter ID " + filterIDs[i]);
}
RawCoder.validate(filters);
// Check the memory usage limit.
if (memoryLimit >= 0) {
int memoryNeeded = 0;
for (int i = 0; i < filters.length; ++i)
memoryNeeded += filters[i].getMemoryUsage();
if (memoryNeeded > memoryLimit)
throw new MemoryLimitException(memoryNeeded, memoryLimit);
}
// Use an input size counter to calculate
// the size of the Compressed Data field.
inCounted = new CountingInputStream(in);
// Initialize the filter chain.
filterChain = inCounted;
for (int i = filters.length - 1; i >= 0; --i)
filterChain = filters[i].getInputStream(filterChain);
}
public int read() throws IOException {
byte[] buf = new byte[1];
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
}
public int read(byte[] buf, int off, int len) throws IOException {
int ret = filterChain.read(buf, off, len);
long compressedSize = inCounted.getSize();
if (ret > 0) {
check.update(buf, off, ret);
uncompressedSize += ret;
// Catch invalid values.
if (compressedSize < 0
|| compressedSize > compressedSizeLimit
|| uncompressedSize < 0
|| (uncompressedSizeInHeader != -1
&& uncompressedSize > uncompressedSizeInHeader))
throw new CorruptedInputException();
} else if (ret == -1) {
// Validate Compressed Size and Uncompressed Size if they were
// present in Block Header.
if ((compressedSizeInHeader != -1
&& compressedSizeInHeader != compressedSize)
|| (uncompressedSizeInHeader != -1
&& uncompressedSizeInHeader != uncompressedSize))
throw new CorruptedInputException();
// Block Padding bytes must be zeros.
for (long i = compressedSize; (i & 3) != 0; ++i)
if (inData.readUnsignedByte() != 0x00)
throw new CorruptedInputException();
// Validate the integrity check.
byte[] storedCheck = new byte[check.getSize()];
inData.readFully(storedCheck);
if (!Arrays.equals(check.finish(), storedCheck))
throw new CorruptedInputException("Integrity ("
+ check.getName() + ") check does not match");
}
return ret;
}
public int available() throws IOException {
return filterChain.available();
}
public long getUnpaddedSize() {
return headerSize + inCounted.getSize() + check.getSize();
}
public long getUncompressedSize() {
return uncompressedSize;
}
}

View File

@@ -1,128 +0,0 @@
/*
* BlockOutputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.OutputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.tukaani.xz.common.EncoderUtil;
import org.tukaani.xz.check.Check;
class BlockOutputStream extends FinishableOutputStream {
private final OutputStream out;
private final CountingOutputStream outCounted;
private FinishableOutputStream filterChain;
private final Check check;
private final int headerSize;
private final long compressedSizeLimit;
private long uncompressedSize = 0;
public BlockOutputStream(OutputStream out, FilterEncoder[] filters,
Check check) throws IOException {
this.out = out;
this.check = check;
// Initialize the filter chain.
outCounted = new CountingOutputStream(out);
filterChain = outCounted;
for (int i = 0; i < filters.length; ++i)
filterChain = filters[i].getOutputStream(filterChain);
// Prepare to encode the Block Header field.
ByteArrayOutputStream bufStream = new ByteArrayOutputStream();
// Write a dummy Block Header Size field. The real value is written
// once everything else except CRC32 has been written.
bufStream.write(0x00);
// Write Block Flags. Storing Compressed Size or Uncompressed Size
// isn't supported for now.
bufStream.write(filters.length - 1);
// List of Filter Flags
for (int i = 0; i < filters.length; ++i) {
EncoderUtil.encodeVLI(bufStream, filters[i].getFilterID());
byte[] filterProps = filters[i].getFilterProps();
EncoderUtil.encodeVLI(bufStream, filterProps.length);
bufStream.write(filterProps);
}
// Header Padding
while ((bufStream.size() & 3) != 0)
bufStream.write(0x00);
byte[] buf = bufStream.toByteArray();
// Total size of the Block Header: Take the size of the CRC32 field
// into account.
headerSize = buf.length + 4;
// This is just a sanity check.
if (headerSize > EncoderUtil.BLOCK_HEADER_SIZE_MAX)
throw new UnsupportedOptionsException();
// Block Header Size
buf[0] = (byte)(buf.length / 4);
// Write the Block Header field to the output stream.
out.write(buf);
EncoderUtil.writeCRC32(out, buf);
// Calculate the maximum allowed size of the Compressed Data field.
// It is hard to exceed it so this is mostly to be pedantic.
compressedSizeLimit = (EncoderUtil.VLI_MAX & ~3)
- headerSize - check.getSize();
}
public void write(int b) throws IOException {
byte[] buf = new byte[1];
buf[0] = (byte)b;
write(buf, 0, 1);
}
public void write(byte[] buf, int off, int len) throws IOException {
filterChain.write(buf, off, len);
check.update(buf, off, len);
uncompressedSize += len;
validate();
}
public void finish() throws IOException {
// Finish the Compressed Data field.
filterChain.finish();
validate();
// Block Padding
for (long i = outCounted.getSize(); (i & 3) != 0; ++i)
out.write(0x00);
// Check
out.write(check.finish());
}
private void validate() throws IOException {
long compressedSize = outCounted.getSize();
// It is very hard to trigger this exception.
// This is just to be pedantic.
if (compressedSize < 0 || compressedSize > compressedSizeLimit
|| uncompressedSize < 0)
throw new XZIOException("XZ Stream has grown too big");
}
public long getUnpaddedSize() {
return headerSize + outCounted.getSize() + check.getSize();
}
public long getUncompressedSize() {
return uncompressedSize;
}
}

View File

@@ -1,37 +0,0 @@
/*
* CorruptedInputException
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
/**
* Thrown when the compressed input data is corrupt.
* However, it is possible that some or all of the data
* already read from the input stream was corrupt too.
*/
public class CorruptedInputException extends XZIOException {
private static final long serialVersionUID = 3L;
/**
* Creates a new CorruptedInputException with
* the default error detail message.
*/
public CorruptedInputException() {
super("Compressed data is corrupt");
}
/**
* Creates a new CorruptedInputException with
* the specified error detail message.
*
* @param s error detail message
*/
public CorruptedInputException(String s) {
super(s);
}
}

View File

@@ -1,42 +0,0 @@
/*
* CountingInputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.FilterInputStream;
import java.io.InputStream;
import java.io.IOException;
class CountingInputStream extends FilterInputStream {
private long size = 0;
public CountingInputStream(InputStream in) {
super(in);
}
public int read() throws IOException {
int ret = in.read();
if (ret != -1 && size >= 0)
++size;
return ret;
}
public int read(byte[] b, int off, int len) throws IOException {
int ret = in.read(b, off, len);
if (ret > 0 && size >= 0)
size += ret;
return ret;
}
public long getSize() {
return size;
}
}

View File

@@ -1,46 +0,0 @@
/*
* CountingOutputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.OutputStream;
import java.io.IOException;
class CountingOutputStream extends FinishableOutputStream {
private final OutputStream out;
private long size = 0;
public CountingOutputStream(OutputStream out) {
this.out = out;
}
public void write(int b) throws IOException {
out.write(b);
if (size >= 0)
++size;
}
public void write(byte[] b, int off, int len) throws IOException {
out.write(b, off, len);
if (size >= 0)
size += len;
}
public void flush() throws IOException {
out.flush();
}
public void close() throws IOException {
out.close();
}
public long getSize() {
return size;
}
}

View File

@@ -1,26 +0,0 @@
/*
* DeltaCoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
abstract class DeltaCoder implements FilterCoder {
public static final long FILTER_ID = 0x03;
public boolean changesSize() {
return false;
}
public boolean nonLastOK() {
return true;
}
public boolean lastOK() {
return false;
}
}

View File

@@ -1,32 +0,0 @@
/*
* DeltaDecoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
class DeltaDecoder extends DeltaCoder implements FilterDecoder {
private final int distance;
DeltaDecoder(byte[] props) throws UnsupportedOptionsException {
if (props.length != 1)
throw new UnsupportedOptionsException(
"Unsupported Delta filter properties");
distance = (props[0] & 0xFF) + 1;
}
public int getMemoryUsage() {
return 1;
}
public InputStream getInputStream(InputStream in) {
return new DeltaInputStream(in, distance);
}
}

View File

@@ -1,105 +0,0 @@
/*
* DeltaInputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
import java.io.IOException;
import org.tukaani.xz.delta.DeltaDecoder;
/**
* Decodes Delta-filtered data.
* <p>
* The delta filter doesn't change the size of the data and thus it
* cannot have an end-of-payload marker. It will simply decode until
* its input stream indicates end of input.
*/
public class DeltaInputStream extends InputStream {
/**
* Smallest supported delta calculation distance.
*/
public static final int DISTANCE_MIN = 1;
/**
* Largest supported delta calculation distance.
*/
public static final int DISTANCE_MAX = 256;
private final InputStream in;
private final DeltaDecoder delta;
/**
* Creates a new Delta decoder with the given delta calculation distance.
*
* @param in input stream from which Delta filtered data
* is read
*
* @param distance delta calculation distance, must be in the
* range [<code>DISTANCE_MIN</code>,
* <code>DISTANCE_MAX</code>]
*/
public DeltaInputStream(InputStream in, int distance) {
this.in = in;
this.delta = new DeltaDecoder(distance);
}
/**
* Decode the next byte from this input stream.
*
* @return the next decoded byte, or <code>-1</code> to indicate
* the end of input on the input stream <code>in</code>
*
* @throws IOException may be thrown by <code>in</code>
*/
public int read() throws IOException {
byte[] buf = new byte[1];
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
}
/**
* Decode into an array of bytes.
* <p>
* This calls <code>in.read(buf, off, len)</code> and defilters the
* returned data.
*
* @param buf target buffer for decoded data
* @param off start offset in <code>buf</code>
* @param len maximum number of bytes to read
*
* @return number of bytes read, or <code>-1</code> to indicate
* the end of the input stream <code>in</code>
*
* @throws      IOException may be thrown by underlying input
* stream <code>in</code>
*/
public int read(byte[] buf, int off, int len) throws IOException {
int size = in.read(buf, off, len);
if (size == -1)
return -1;
delta.decode(buf, off, size);
return size;
}
/**
* Calls <code>in.available()</code>.
*
* @return the value returned by <code>in.available()</code>
*/
public int available() throws IOException {
return in.available();
}
/**
* Calls <code>in.close()</code>.
*/
public void close() throws IOException {
in.close();
}
}

View File

@@ -1,16 +0,0 @@
/*
* FilterCoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
interface FilterCoder {
boolean changesSize();
boolean nonLastOK();
boolean lastOK();
}

View File

@@ -1,17 +0,0 @@
/*
* FilterDecoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
interface FilterDecoder extends FilterCoder {
int getMemoryUsage();
InputStream getInputStream(InputStream in);
}

View File

@@ -1,16 +0,0 @@
/*
* FilterEncoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
interface FilterEncoder extends FilterCoder {
long getFilterID();
byte[] getFilterProps();
FinishableOutputStream getOutputStream(FinishableOutputStream out);
}

View File

@@ -1,28 +0,0 @@
/*
* FilterOptions
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
import java.io.IOException;
public abstract class FilterOptions implements Cloneable {
public abstract int getEncoderMemoryUsage();
public abstract FinishableOutputStream getOutputStream(
FinishableOutputStream out);
public abstract int getDecoderMemoryUsage();
public abstract InputStream getInputStream(InputStream in);
abstract FilterEncoder getFilterEncoder();
FilterOptions() {}
}

View File

@@ -1,31 +0,0 @@
/*
* FinishableOutputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.OutputStream;
import java.io.IOException;
/**
* Output stream that supports finishing without closing
* the underlying stream.
*/
public abstract class FinishableOutputStream extends OutputStream {
/**
* Finish the stream without closing the underlying stream.
* No more data may be written to the stream after finishing.
* <p>
* The <code>finish</code> method of <code>FinishableOutputStream</code>
* does nothing. Subclasses should override it if they need finishing
* support, which is the case, for example, with compressors.
*
* @throws IOException
*/
public void finish() throws IOException {}
}

View File

@@ -1,14 +0,0 @@
/*
* IndexIndicatorException
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
class IndexIndicatorException extends Exception {
private static final long serialVersionUID = 1L;
}

View File

@@ -1,26 +0,0 @@
/*
* LZMA2Coder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
abstract class LZMA2Coder implements FilterCoder {
public static final long FILTER_ID = 0x21;
public boolean changesSize() {
return true;
}
public boolean nonLastOK() {
return false;
}
public boolean lastOK() {
return true;
}
}

View File

@ -1,35 +0,0 @@
/*
* LZMA2Decoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
class LZMA2Decoder extends LZMA2Coder implements FilterDecoder {
private int dictSize;
LZMA2Decoder(byte[] props) throws UnsupportedOptionsException {
// Up to 1.5 GiB dictionary is supported. The bigger ones
// are too big for int.
if (props.length != 1 || (props[0] & 0xFF) > 37)
throw new UnsupportedOptionsException(
"Unsupported LZMA2 properties");
dictSize = 2 | (props[0] & 1);
dictSize <<= (props[0] >>> 1) + 11;
}
public int getMemoryUsage() {
return LZMA2InputStream.getMemoryUsage(dictSize);
}
public InputStream getInputStream(InputStream in) {
return new LZMA2InputStream(in, dictSize);
}
}

View File

@@ -1,35 +0,0 @@
/*
* LZMA2Encoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
class LZMA2Encoder extends LZMA2Coder implements FilterEncoder {
private final LZMA2Options options;
private final byte[] props = new byte[1];
LZMA2Encoder(LZMA2Options options) {
// Make a private copy so that the caller is free to change its copy.
this.options = (LZMA2Options)options.clone();
// TODO: Props!!!
}
public long getFilterID() {
return FILTER_ID;
}
public byte[] getFilterProps() {
return props;
}
public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
return options.getOutputStream(out);
}
}

View File

@@ -1,328 +0,0 @@
/*
* LZMA2InputStream
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import org.tukaani.xz.lz.LZDecoder;
import org.tukaani.xz.rangecoder.RangeDecoder;
import org.tukaani.xz.lzma.LZMADecoder;
/**
* Decompresses a raw LZMA2 stream.
*/
public class LZMA2InputStream extends InputStream {
/**
* Smallest valid LZMA2 dictionary size.
* <p>
* Very tiny dictionaries would be a performance problem, so
* the minimum is 4 KiB.
*/
public static final int DICT_SIZE_MIN = 4096;
/**
* Largest dictionary size supported by this implementation.
* <p>
* The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB.
* This implementation supports only 16 bytes less than 2 GiB for raw
* LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This
* limitation is due to Java using signed 32-bit integers for array
* indexing. The limitation shouldn't matter much in practice since so
* huge dictionaries are not normally used.
*/
public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;
private static final int COMPRESSED_SIZE_MAX = 1 << 16;
private final DataInputStream in;
private final LZDecoder lz;
private final RangeDecoder rc = new RangeDecoder(COMPRESSED_SIZE_MAX);
private LZMADecoder lzma;
private int uncompressedSize = 0;
private boolean isLZMAChunk;
private boolean needDictReset = true;
private boolean needProps = true;
private boolean endReached = false;
private IOException exception = null;
/**
* Gets approximate decompressor memory requirements as kibibytes for
* the given dictionary size.
*
* @param dictSize LZMA2 dictionary size as bytes, must be
* in the range [<code>DICT_SIZE_MIN</code>,
* <code>DICT_SIZE_MAX</code>]
*
* @return approximate memory requirements as kibibytes (KiB)
*/
public static int getMemoryUsage(int dictSize) {
// The base state is around 30-40 KiB (probabilities etc.),
// range decoder needs COMPRESSED_SIZE_MAX bytes for buffering,
// and LZ decoder needs a dictionary buffer.
return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024;
}
private static int getDictSize(int dictSize) {
if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX)
throw new IllegalArgumentException(
"Unsupported dictionary size " + dictSize);
// Round dictionary size upward to a multiple of 16. This way LZMA
// can use LZDecoder.getPos() for calculating LZMA's posMask.
// Note that this check is needed only for raw LZMA2 streams; it is
// redundant with .xz.
return (dictSize + 15) & ~15;
}
/**
* Creates a new input stream that decompresses raw LZMA2 data
* from <code>in</code>.
* <p>
* The caller needs to know the dictionary size used when compressing;
* the dictionary size isn't stored as part of a raw LZMA2 stream.
* <p>
* Specifying a too small dictionary size will prevent decompressing
* the stream. Specifying a too big dictionary is waste of memory but
* decompression will work.
* <p>
* There is no need to specify a dictionary bigger than
* the uncompressed size of the data even if a bigger dictionary
* was used when compressing. If you know the uncompressed size
* of the data, this might allow saving some memory.
*
* @param in input stream from which LZMA2-compressed
* data is read
*
* @param dictSize LZMA2 dictionary size as bytes, must be
* in the range [<code>DICT_SIZE_MIN</code>,
* <code>DICT_SIZE_MAX</code>]
*/
public LZMA2InputStream(InputStream in, int dictSize) {
this.in = new DataInputStream(in);
this.lz = new LZDecoder(getDictSize(dictSize), null);
}
/**
* Creates a new LZMA2 decompressor using a preset dictionary.
* <p>
* This is like <code>LZMAInputStream()</code> except that the
* dictionary may be initialized using a preset dictionary.
* If a preset dictionary was used when compressing the data, the
* same preset dictionary must be provided when decompressing.
*
* @param in input stream from which LZMA2-compressed
* data is read
*
* @param dictSize LZMA2 dictionary size as bytes, must be
* in the range [<code>DICT_SIZE_MIN</code>,
* <code>DICT_SIZE_MAX</code>]
*
* @param presetDict preset dictionary or <code>null</code>
* to use no preset dictionary
*/
public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) {
this.in = new DataInputStream(in);
this.lz = new LZDecoder(getDictSize(dictSize), presetDict);
if (presetDict.length > 0)
needDictReset = false;
}
/**
* Decompresses the next byte from this input stream.
* <p>
* Reading lots of data with <code>read()</code> from this input stream
* may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
* if you need to read lots of data one byte at a time.
*
* @return the next decompressed byte, or <code>-1</code>
* to indicate the end of the compressed stream
*
* @throws CorruptedInputException
*
* @throws EOFException
* compressed input is truncated or corrupt
*
* @throws IOException may be thrown by <code>in</code>
*/
public int read() throws IOException {
byte[] buf = new byte[1];
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
}
/**
* Decompresses into an array of bytes.
* <p>
* If <code>len</code> is zero, no bytes are read and <code>0</code>
* is returned. Otherwise this will block until <code>len</code>
* bytes have been decompressed, the end of LZMA2 stream is reached,
* or an exception is thrown.
*
* @param buf target buffer for uncompressed data
* @param off start offset in <code>buf</code>
* @param len maximum number of uncompressed bytes to read
*
* @return number of bytes read, or <code>-1</code> to indicate
* the end of the compressed stream
*
* @throws CorruptedInputException
*
* @throws EOFException
* compressed input is truncated or corrupt
*
* @throws IOException may be thrown by <code>in</code>
*/
public int read(byte[] buf, int off, int len) throws IOException {
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
throw new IllegalArgumentException();
if (len == 0)
return 0;
if (exception != null)
throw exception;
if (endReached)
return -1;
try {
int size = 0;
while (len > 0) {
if (uncompressedSize == 0) {
decodeChunkHeader();
if (endReached)
return size == 0 ? -1 : size;
}
int copySizeMax = Math.min(uncompressedSize, len);
if (!isLZMAChunk) {
lz.copyUncompressed(in, copySizeMax);
} else {
lz.setLimit(copySizeMax);
lzma.decode();
}
int copiedSize = lz.flush(buf, off);
off += copiedSize;
len -= copiedSize;
size += copiedSize;
uncompressedSize -= copiedSize;
if (uncompressedSize == 0)
if (!rc.isFinished() || lz.hasPending())
throw new CorruptedInputException();
}
return size;
} catch (IOException e) {
exception = e;
throw e;
}
}
private void decodeChunkHeader() throws IOException {
int control = in.readUnsignedByte();
if (control == 0x00) {
endReached = true;
return;
}
if (control >= 0xE0 || control == 0x01) {
needProps = true;
needDictReset = false;
lz.reset();
} else if (needDictReset) {
throw new CorruptedInputException();
}
if (control >= 0x80) {
isLZMAChunk = true;
uncompressedSize = (control & 0x1F) << 16;
uncompressedSize += in.readUnsignedShort() + 1;
int compressedSize = in.readUnsignedShort() + 1;
if (control >= 0xC0) {
needProps = false;
decodeProps();
} else if (needProps) {
throw new CorruptedInputException();
} else if (control >= 0xA0) {
lzma.reset();
}
rc.prepareInputBuffer(in, compressedSize);
} else if (control > 0x02) {
throw new CorruptedInputException();
} else {
isLZMAChunk = false;
uncompressedSize = in.readUnsignedShort() + 1;
}
}
private void decodeProps() throws IOException {
int props = in.readUnsignedByte();
if (props > (4 * 5 + 4) * 9 + 8)
throw new CorruptedInputException();
int pb = props / (9 * 5);
props -= pb * 9 * 5;
int lp = props / 9;
int lc = props - lp * 9;
if (lc + lp > 4)
throw new CorruptedInputException();
lzma = new LZMADecoder(lz, rc, lc, lp, pb);
}
/**
* Returns the number of uncompressed bytes that can be read
* without blocking. The value is returned with an assumption
* that the compressed input data will be valid. If the compressed
* data is corrupt, <code>CorruptedInputException</code> may get
* thrown before the number of bytes claimed to be available have
* been read from this input stream.
* <p>
* In LZMA2InputStream, the return value will be non-zero when the
* decompressor is in the middle of an LZMA2 chunk. The return value
* will then be the number of uncompressed bytes remaining from that
* chunk.
*
* @return the number of uncompressed bytes that can be read
* without blocking
*/
public int available() {
return uncompressedSize;
}
/**
* Calls <code>in.close()</code>.
*/
public void close() throws IOException {
in.close();
}
}
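A minimal usage sketch for this class (not part of the original sources); the file name and the 8 MiB dictionary size are assumptions, and the dictionary size must match the one used when the data was compressed.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import org.tukaani.xz.LZMA2InputStream;

class LZMA2Cat {
    public static void main(String[] args) throws IOException {
        try (FileInputStream file = new FileInputStream("data.lzma2")) {
            // Buffer the underlying file reads; the decompressor itself
            // reads whole chunks at a time.
            LZMA2InputStream in =
                    new LZMA2InputStream(new BufferedInputStream(file), 8 << 20);
            byte[] buf = new byte[8192];
            for (int n; (n = in.read(buf, 0, buf.length)) != -1; )
                System.out.write(buf, 0, n);   // copy decompressed bytes to stdout
            System.out.flush();
        }
    }
}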

View File

@ -1,139 +0,0 @@
/*
* LZMA2Options
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
import java.io.IOException;
/**
* Options for LZMA2.
* <p>
* FIXME: This is unfinished and things might change.
*/
public class LZMA2Options extends FilterOptions {
/**
* Default compression preset.
*/
public static final int PRESET_DEFAULT = 6;
/**
* Minimum dictionary size.
*/
public static final int DICT_SIZE_MIN = 4096;
/**
* Maximum dictionary size for compression.
* <p>
* FIXME? Decompression dictionary size can be bigger.
*/
public static final int DICT_SIZE_MAX = 128 << 20;
/**
* Maximum value for lc + lp.
*/
public static final int LC_LP_MAX = 4;
/**
* Maximum value for pb.
*/
public static final int PB_MAX = 4;
/**
* Compression mode: uncompressed.
* The data is wrapped into a LZMA2 stream without compression.
*/
public static final int MODE_UNCOMPRESSED = 0;
/**
* Compression mode: fast.
* This is usually combined with a hash chain match finder.
*/
public static final int MODE_FAST = 1;
/**
* Compression mode: normal.
* This is usually combined with a binary tree match finder.
*/
public static final int MODE_NORMAL = 2;
/**
* Minimum value for <code>niceLen</code>.
*/
public static final int NICE_LEN_MIN = 8;
/**
* Maximum value for <code>niceLen</code>.
*/
public static final int NICE_LEN_MAX = 273;
/**
* Match finder: Hash Chain 2-3-4
*/
public static final int MF_HC4 = 0x04;
/**
* Match finder: Binary tree 2-3-4
*/
public static final int MF_BT4 = 0x14;
private int dictSize;
/*
public int lc;
public int lp;
public int pb;
public int mode;
public int niceLen;
public int mf;
public int depth;
*/
public LZMA2Options() {
setPreset(PRESET_DEFAULT);
}
public LZMA2Options(int preset) {
setPreset(preset);
}
public void setPreset(int preset) {
// TODO
dictSize = 8 << 20;
}
public int getEncoderMemoryUsage() {
return LZMA2OutputStream.getMemoryUsage(this);
}
public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
return new LZMA2OutputStream(out, this);
}
public int getDecoderMemoryUsage() {
return LZMA2InputStream.getMemoryUsage(dictSize);
}
public InputStream getInputStream(InputStream in) {
return new LZMA2InputStream(in, dictSize);
}
FilterEncoder getFilterEncoder() {
return new LZMA2Encoder(this);
}
public Object clone() {
try {
return super.clone();
} catch (CloneNotSupportedException e) {
// Never reached
throw new RuntimeException();
}
}
}
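A short sketch of how these options are meant to plug into the filter API (hypothetical helper; the encoder side is still a stub, per the TODO in LZMA2OutputStream below):

import java.io.InputStream;
import org.tukaani.xz.LZMA2Options;

class LZMA2OptionsSketch {
    // Wraps a raw LZMA2 stream using the default preset. The dictionary
    // size comes from the preset, so the data must have been encoded
    // with a dictionary no larger than that.
    static InputStream wrapDecoder(InputStream compressedIn) {
        LZMA2Options options = new LZMA2Options();   // PRESET_DEFAULT
        return options.getInputStream(compressedIn); // LZMA2InputStream under the hood
    }
}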

View File

@ -1,77 +0,0 @@
/*
* LZMA2OutputStream
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.IOException;
//
// TODO: This creates a valid LZMA2 stream but it doesn't compress.
// So this is useless except for testing the .xz container support.
//
class LZMA2OutputStream extends FinishableOutputStream {
private final FinishableOutputStream out;
static int getMemoryUsage(LZMA2Options options) {
// TODO
return 1;
}
LZMA2OutputStream(FinishableOutputStream out, LZMA2Options options) {
this.out = out;
}
public void write(int b) throws IOException {
byte[] buf = new byte[1];
buf[0] = (byte)b;
write(buf, 0, 1);
}
public void write(byte[] buf, int off, int len) throws IOException {
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
throw new IllegalArgumentException();
while (len > 0x10000) {
writeChunk(buf, off, 0x10000);
off += 0x10000;
len -= 0x10000;
}
writeChunk(buf, off, len);
}
private void writeChunk(byte[] buf, int off, int len) throws IOException {
out.write(0x01);
out.write((len - 1) >>> 8);
out.write(len - 1);
out.write(buf, off, len);
}
private void writeEndMarker() throws IOException {
// TODO: Flush incomplete chunk.
out.write(0x00);
}
public void flush() throws IOException {
throw new UnsupportedOptionsException(
"Flushing LZMA2OutputStream not implemented yet");
}
public void finish() throws IOException {
writeEndMarker();
out.finish();
}
public void close() throws IOException {
writeEndMarker();
out.close();
}
}
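The chunk framing produced by writeChunk is simple enough to reproduce by hand; a standalone sketch (an assumed helper, not using the class) that frames a buffer as one uncompressed LZMA2 chunk plus the end marker:

import java.io.ByteArrayOutputStream;
import java.io.IOException;

class LZMA2ChunkSketch {
    // Mirrors writeChunk/writeEndMarker above: control byte 0x01
    // (uncompressed chunk with dictionary reset), 16-bit big-endian
    // (size - 1), the raw data, then the 0x00 end-of-stream marker.
    // Assumes 1 <= data.length <= 0x10000.
    static byte[] frame(byte[] data) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        out.write(0x01);
        out.write((data.length - 1) >>> 8);
        out.write(data.length - 1);
        out.write(data);
        out.write(0x00);
        return out.toByteArray();
    }
}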

View File

@ -1,60 +0,0 @@
/*
* MemoryLimitException
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
/**
* Thrown when the memory usage limit given to the XZ decompressor
* would be exceeded.
* <p>
* The amount of memory required and the memory usage limit are
* included in the error detail message in human readable format.
*/
public class MemoryLimitException extends XZIOException {
private static final long serialVersionUID = 3L;
private final int memoryNeeded;
private final int memoryLimit;
/**
* Creates a new MemoryLimitException.
* <p>
* The amount of memory needed and the memory usage limit are
* included in the error detail message.
*
* @param memoryNeeded amount of memory needed as kibibytes (KiB)
* @param memoryLimit specified memory usage limit as kibibytes (KiB)
*/
public MemoryLimitException(int memoryNeeded, int memoryLimit) {
super("" + memoryNeeded + " KiB of memory would be needed; limit was "
+ memoryLimit + " KiB");
this.memoryNeeded = memoryNeeded;
this.memoryLimit = memoryLimit;
}
/**
* Gets how much memory is required to decompress the data.
*
* @return amount of memory needed as kibibytes (KiB)
*/
public int getMemoryNeeded() {
return memoryNeeded;
}
/**
* Gets what the memory usage limit was at the time the exception
* was created.
*
* @return memory usage limit as kibibytes (KiB)
*/
public int getMemoryLimit() {
return memoryLimit;
}
}

View File

@ -1,33 +0,0 @@
/*
* RawCoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
class RawCoder {
static void validate(FilterCoder[] filters)
throws UnsupportedOptionsException {
for (int i = 0; i < filters.length - 1; ++i)
if (!filters[i].nonLastOK())
throw new UnsupportedOptionsException(
"Unsupported XZ filter chain");
if (!filters[filters.length - 1].lastOK())
throw new UnsupportedOptionsException(
"Unsupported XZ filter chain");
int changesSizeCount = 0;
for (int i = 0; i < filters.length; ++i)
if (filters[i].changesSize())
++changesSizeCount;
if (changesSizeCount > 3)
throw new UnsupportedOptionsException(
"Unsupported XZ filter chain");
}
}

View File

@ -1,285 +0,0 @@
/*
* SingleXZInputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.EOFException;
import org.tukaani.xz.common.DecoderUtil;
import org.tukaani.xz.common.StreamFlags;
import org.tukaani.xz.index.IndexHash;
import org.tukaani.xz.check.Check;
/**
* Decompresses exactly one XZ Stream in streamed mode (no seeking).
* The decompression stops after the first XZ Stream has been decompressed,
* and the read position in the input stream is left at the first byte
* after the end of the XZ Stream. This can be useful when XZ data has
* been stored inside some other file format or protocol.
* <p>
* Unless you know what you are doing, don't use this class to decompress
* standalone .xz files. For that purpose, use <code>XZInputStream</code>.
*
* @see XZInputStream
*/
public class SingleXZInputStream extends InputStream {
private InputStream in;
private int memoryLimit;
private StreamFlags streamHeaderFlags;
private Check check;
private BlockInputStream blockDecoder = null;
private final IndexHash indexHash = new IndexHash();
private boolean endReached = false;
private IOException exception = null;
/**
* Creates a new input stream that decompresses exactly one XZ Stream
* from <code>in</code>.
* <p>
* This constructor reads and parses the XZ Stream Header (12 bytes)
* from <code>in</code>. The header of the first Block is not read
* until <code>read</code> is called.
*
* @param in input stream from which XZ-compressed
* data is read
*
* @throws XZFormatException
* input is not in the XZ format
*
* @throws CorruptedInputException
* XZ header CRC32 doesn't match
*
* @throws UnsupportedOptionsException
* XZ header is valid but specifies options
* not supported by this implementation
*
* @throws EOFException
* less than 12 bytes of input was available
* from <code>in</code>
*
* @throws IOException may be thrown by <code>in</code>
*/
public SingleXZInputStream(InputStream in) throws IOException {
initialize(in, -1);
}
/**
* Creates a new single-stream XZ decompressor with optional
* memory usage limit.
* <p>
* This is identical to <code>SingleXZInputStream(InputStream)</code>
* except that this also takes the <code>memoryLimit</code> argument.
*
* @param in input stream from which XZ-compressed
* data is read
*
* @param memoryLimit memory usage limit as kibibytes (KiB)
* or -1 to impose no memory usage limit
*
* @throws XZFormatException
* input is not in the XZ format
*
* @throws CorruptedInputException
* XZ header CRC32 doesn't match
*
* @throws UnsupportedOptionsException
* XZ header is valid but specifies options
* not supported by this implementation
*
* @throws EOFException
* less than 12 bytes of input was available
* from <code>in</code>
*
* @throws IOException may be thrown by <code>in</code>
*/
public SingleXZInputStream(InputStream in, int memoryLimit)
throws IOException {
initialize(in, memoryLimit);
}
SingleXZInputStream(InputStream in, int memoryLimit,
byte[] streamHeader) throws IOException {
initialize(in, memoryLimit, streamHeader);
}
private void initialize(InputStream in, int memoryLimit)
throws IOException {
byte[] streamHeader = new byte[DecoderUtil.STREAM_HEADER_SIZE];
new DataInputStream(in).readFully(streamHeader);
initialize(in, memoryLimit, streamHeader);
}
private void initialize(InputStream in, int memoryLimit,
byte[] streamHeader) throws IOException {
this.in = in;
this.memoryLimit = memoryLimit;
streamHeaderFlags = DecoderUtil.decodeStreamHeader(streamHeader);
check = Check.getInstance(streamHeaderFlags.checkType);
}
/**
* Gets the ID of the integrity check used in this XZ Stream.
*
* @return the Check ID specified in the XZ Stream Header
*/
public int getCheckType() {
return streamHeaderFlags.checkType;
}
/**
* Gets the name of the integrity check used in this XZ Stream.
*
* @return the name of the check specified in the XZ Stream Header
*/
public String getCheckName() {
return check.getName();
}
/**
* Decompresses the next byte from this input stream.
* <p>
* Reading lots of data with <code>read()</code> from this input stream
* may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
* if you need to read lots of data one byte at a time.
*
* @return the next decompressed byte, or <code>-1</code>
* to indicate the end of the compressed stream
*
* @throws CorruptedInputException
* @throws UnsupportedOptionsException
* @throws MemoryLimitException
*
* @throws EOFException
* compressed input is truncated or corrupt
*
* @throws IOException may be thrown by <code>in</code>
*/
public int read() throws IOException {
byte[] buf = new byte[1];
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
}
/**
* Decompresses into an array of bytes.
* <p>
* If <code>len</code> is zero, no bytes are read and <code>0</code>
* is returned. Otherwise this will try to decompress <code>len</code>
* bytes of uncompressed data. Less than <code>len</code> bytes may
* be read only in the following situations:
* <ul>
* <li>The end of the compressed data was reached successfully.</li>
* <li>An error is detected after at least one but less than <code>len</code>
* bytes have already been successfully decompressed.
* The next call with non-zero <code>len</code> will immediately
* throw the pending exception.</li>
* <li>An exception is thrown.</li>
* </ul>
*
* @param buf target buffer for uncompressed data
* @param off start offset in <code>buf</code>
* @param len maximum number of uncompressed bytes to read
*
* @return number of bytes read, or <code>-1</code> to indicate
* the end of the compressed stream
*
* @throws CorruptedInputException
* @throws UnsupportedOptionsException
* @throws MemoryLimitException
*
* @throws EOFException
* compressed input is truncated or corrupt
*
* @throws IOException may be thrown by <code>in</code>
*/
public int read(byte[] buf, int off, int len) throws IOException {
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
throw new IllegalArgumentException();
if (len == 0)
return 0;
if (exception != null)
throw exception;
if (endReached)
return -1;
int size = 0;
try {
while (len > 0) {
if (blockDecoder == null) {
try {
blockDecoder = new BlockInputStream(in, check,
memoryLimit);
} catch (IndexIndicatorException e) {
indexHash.validate(in);
validateStreamFooter();
endReached = true;
return size > 0 ? size : -1;
}
}
int ret = blockDecoder.read(buf, off, len);
if (ret > 0) {
size += ret;
off += ret;
len -= ret;
} else if (ret == -1) {
indexHash.add(blockDecoder.getUnpaddedSize(),
blockDecoder.getUncompressedSize());
blockDecoder = null;
}
}
} catch (IOException e) {
exception = e;
if (size == 0)
throw e;
}
return size;
}
private void validateStreamFooter() throws IOException {
byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
new DataInputStream(in).readFully(buf);
StreamFlags streamFooterFlags = DecoderUtil.decodeStreamFooter(buf);
if (!DecoderUtil.areStreamFlagsEqual(streamHeaderFlags,
streamFooterFlags)
|| indexHash.getIndexSize() != streamFooterFlags.backwardSize)
throw new CorruptedInputException(
"XZ Stream Footer does not match Stream Header");
}
/**
* Returns the number of uncompressed bytes that can be read
* without blocking. The value is returned with an assumption
* that the compressed input data will be valid. If the compressed
* data is corrupt, <code>CorruptedInputException</code> may get
* thrown before the number of bytes claimed to be available have
* been read from this input stream.
*
* @return the number of uncompressed bytes that can be read
* without blocking
*/
public int available() throws IOException {
return blockDecoder == null ? 0 : blockDecoder.available();
}
/**
* Calls <code>in.close()</code>.
*/
public void close() throws IOException {
in.close();
}
}
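A usage sketch for the embedded-stream case this class exists for (hypothetical helper). Reading stops after the Stream Footer, so the underlying stream is left at the first byte after the XZ data, provided no extra buffering sits in between:

import java.io.IOException;
import java.io.InputStream;
import org.tukaani.xz.SingleXZInputStream;

class EmbeddedXZSketch {
    // Decompresses one XZ Stream stored inside a larger container
    // format, buffering the result in memory.
    static byte[] readEmbedded(InputStream in) throws IOException {
        return new SingleXZInputStream(in).readAllBytes();
    }
}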

View File

@ -1,34 +0,0 @@
/*
* UnsupportedOptionsException
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
/**
* Thrown when compression options not supported by this implementation
* are detected. Some other implementation might support those options.
*/
public class UnsupportedOptionsException extends XZIOException {
private static final long serialVersionUID = 3L;
/**
* Creates a new UnsupportedOptionsException with null
* as its error detail message.
*/
public UnsupportedOptionsException() {}
/**
* Creates a new UnsupportedOptionsException with the given
* error detail message.
*
* @param s error detail message
*/
public UnsupportedOptionsException(String s) {
super(s);
}
}

View File

@ -1,53 +0,0 @@
/*
* XZ
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
/**
* XZ constants.
*/
public class XZ {
/**
* XZ Header Magic Bytes begin an XZ file.
* This can be useful to detect XZ compressed data.
*/
public static final byte[] HEADER_MAGIC = {
(byte)0xFD, '7', 'z', 'X', 'Z', '\0' };
/**
* XZ Footer Magic Bytes are the last bytes of an XZ Stream.
*/
public static final byte[] FOOTER_MAGIC = { 'Y', 'Z' };
/**
* Integrity check ID indicating that no integrity check is calculated.
* <p>
* Omitting the integrity check is strongly discouraged except when
* the integrity of the data will be verified by other means anyway,
* and calculating the check twice would be useless.
*/
public static final int CHECK_NONE = 0;
/**
* Integrity check ID for CRC32.
*/
public static final int CHECK_CRC32 = 1;
/**
* Integrity check ID for CRC64.
*/
public static final int CHECK_CRC64 = 4;
/**
* Integrity check ID for SHA-256.
*/
public static final int CHECK_SHA256 = 10;
private XZ() {}
}
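A small sketch showing the intended use of HEADER_MAGIC for format sniffing (assumed helper):

import org.tukaani.xz.XZ;

class XZSniff {
    // True if the buffer starts with the XZ Header Magic Bytes
    // (0xFD '7' 'z' 'X' 'Z' '\0').
    static boolean looksLikeXZ(byte[] buf) {
        if (buf.length < XZ.HEADER_MAGIC.length)
            return false;
        for (int i = 0; i < XZ.HEADER_MAGIC.length; ++i)
            if (buf[i] != XZ.HEADER_MAGIC[i])
                return false;
        return true;
    }
}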

View File

@ -1,24 +0,0 @@
/*
* XZFormatException
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
/**
* Thrown when the input data is not in the XZ format.
*/
public class XZFormatException extends XZIOException {
private static final long serialVersionUID = 3L;
/**
* Creates a new exception with the default error detail message.
*/
public XZFormatException() {
super("Input is not in the XZ format");
}
}

View File

@ -1,28 +0,0 @@
/*
* XZIOException
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
/**
* Generic IOException specific to this package.
* All IOExceptions thrown by this package are extended from XZIOException.
* This way it is easier to distinguish exceptions thrown by the XZ code
* from other IOExceptions.
*/
public class XZIOException extends java.io.IOException {
private static final long serialVersionUID = 3L;
public XZIOException() {
super();
}
public XZIOException(String s) {
super(s);
}
}

View File

@ -1,257 +0,0 @@
/*
* XZInputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.EOFException;
import org.tukaani.xz.common.DecoderUtil;
/**
* Decompresses a .xz file in streamed mode (no seeking).
* <p>
* Use this to decompress regular standalone .xz files. This reads from
* its input stream until the end of the input or until an error occurs.
* This supports decompressing concatenated .xz files.
*
* @see SingleXZInputStream
*/
public class XZInputStream extends InputStream {
private final int memoryLimit;
private final InputStream in;
private SingleXZInputStream xzIn;
private boolean endReached = false;
private IOException exception = null;
/**
* Creates a new input stream that decompresses XZ-compressed data
* from <code>in</code>.
* <p>
* This constructor reads and parses the XZ Stream Header (12 bytes)
* from <code>in</code>. The header of the first Block is not read
* until <code>read</code> is called.
*
* @param in input stream from which XZ-compressed
* data is read
*
* @throws XZFormatException
* input is not in the XZ format
*
* @throws CorruptedInputException
* XZ header CRC32 doesn't match
*
* @throws UnsupportedOptionsException
* XZ header is valid but specifies options
* not supported by this implementation
*
* @throws EOFException
* less than 12 bytes of input was available
* from <code>in</code>
*
* @throws IOException may be thrown by <code>in</code>
*/
public XZInputStream(InputStream in) throws IOException {
this.in = in;
this.memoryLimit = -1;
this.xzIn = new SingleXZInputStream(in, -1);
}
/**
* Creates a new input stream that decompresses XZ-compressed data
* from <code>in</code>.
* <p>
* This is identical to <code>XZInputStream(InputStream)</code> except
* that this also takes the <code>memoryLimit</code> argument.
*
* @param in input stream from which XZ-compressed
* data is read
*
* @param memoryLimit memory usage limit as kibibytes (KiB)
* or -1 to impose no memory usage limit
*
* @throws XZFormatException
* input is not in the XZ format
*
* @throws CorruptedInputException
* XZ header CRC32 doesn't match
*
* @throws UnsupportedOptionsException
* XZ header is valid but specifies options
* not supported by this implementation
*
* @throws EOFException
* less than 12 bytes of input was available
* from <code>in</code>
*
* @throws IOException may be thrown by <code>in</code>
*/
public XZInputStream(InputStream in, int memoryLimit) throws IOException {
this.in = in;
this.memoryLimit = memoryLimit;
this.xzIn = new SingleXZInputStream(in, memoryLimit);
}
/**
* Decompresses the next byte from this input stream.
* <p>
* Reading lots of data with <code>read()</code> from this input stream
* may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
* if you need to read lots of data one byte at a time.
*
* @return the next decompressed byte, or <code>-1</code>
* to indicate the end of the compressed stream
*
* @throws CorruptedInputException
* @throws UnsupportedOptionsException
* @throws MemoryLimitException
*
* @throws EOFException
* compressed input is truncated or corrupt
*
* @throws IOException may be thrown by <code>in</code>
*/
public int read() throws IOException {
byte[] buf = new byte[1];
return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
}
/**
* Decompresses into an array of bytes.
* <p>
* If <code>len</code> is zero, no bytes are read and <code>0</code>
* is returned. Otherwise this will try to decompress <code>len</code>
* bytes of uncompressed data. Less than <code>len</code> bytes may
* be read only in the following situations:
* <ul>
* <li>The end of the compressed data was reached successfully.</li>
* <li>An error is detected after at least one but less than <code>len</code>
* bytes have already been successfully decompressed.
* The next call with non-zero <code>len</code> will immediately
* throw the pending exception.</li>
* <li>An exception is thrown.</li>
* </ul>
*
* @param buf target buffer for uncompressed data
* @param off start offset in <code>buf</code>
* @param len maximum number of uncompressed bytes to read
*
* @return number of bytes read, or <code>-1</code> to indicate
* the end of the compressed stream
*
* @throws CorruptedInputException
* @throws UnsupportedOptionsException
* @throws MemoryLimitException
*
* @throws EOFException
* compressed input is truncated or corrupt
*
* @throws IOException may be thrown by <code>in</code>
*/
public int read(byte[] buf, int off, int len) throws IOException {
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
throw new IllegalArgumentException();
if (len == 0)
return 0;
if (exception != null)
throw exception;
if (endReached)
return -1;
int size = 0;
try {
while (len > 0) {
if (xzIn == null) {
prepareNextStream();
if (endReached)
return size == 0 ? -1 : size;
}
int ret = xzIn.read(buf, off, len);
if (ret > 0) {
size += ret;
off += ret;
len -= ret;
} else if (ret == -1) {
xzIn = null;
}
}
} catch (IOException e) {
exception = e;
if (size == 0)
throw e;
}
return size;
}
private void prepareNextStream() throws IOException {
DataInputStream inData = new DataInputStream(in);
byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
// The size of Stream Padding must be a multiple of four bytes,
// all bytes zero.
do {
// First try to read one byte to see if we have reached the end
// of the file.
int ret = inData.read(buf, 0, 1);
if (ret == -1) {
endReached = true;
return;
}
// Since we got one byte of input, there must be at least
// three more available in a valid file.
inData.readFully(buf, 1, 3);
} while (buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0);
// Not all bytes are zero. In a valid Stream it indicates the
// beginning of the next Stream. Read the rest of the Stream Header
// and initialize the XZ decoder.
inData.readFully(buf, 4, DecoderUtil.STREAM_HEADER_SIZE - 4);
try {
xzIn = new SingleXZInputStream(in, memoryLimit, buf);
} catch (XZFormatException e) {
// Since this isn't the first .xz Stream, it is more
// logical to tell that the data is corrupt.
throw new CorruptedInputException(
"Garbage after a valid XZ Stream");
}
}
/**
* Returns the number of uncompressed bytes that can be read
* without blocking. The value is returned with an assumption
* that the compressed input data will be valid. If the compressed
* data is corrupt, <code>CorruptedInputException</code> may get
* thrown before the number of bytes claimed to be available have
* been read from this input stream.
*
* @return the number of uncompressed bytes that can be read
* without blocking
*/
public int available() throws IOException {
return xzIn == null ? 0 : xzIn.available();
}
/**
* Calls <code>in.close()</code>.
*/
public void close() throws IOException {
in.close();
}
}
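The standard decompress-a-file usage as a sketch ("input.xz" is a placeholder). Concatenated .xz files are handled transparently, as described in the class javadoc:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import org.tukaani.xz.XZInputStream;

class XZCat {
    public static void main(String[] args) throws IOException {
        try (XZInputStream in = new XZInputStream(
                new BufferedInputStream(new FileInputStream("input.xz")))) {
            byte[] buf = new byte[8192];
            for (int n; (n = in.read(buf, 0, buf.length)) != -1; )
                System.out.write(buf, 0, n);  // decompressed bytes to stdout
            System.out.flush();
        }
    }
}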

View File

@ -1,290 +0,0 @@
/*
* XZOutputStream
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz;
import java.io.OutputStream;
import java.io.IOException;
import org.tukaani.xz.common.EncoderUtil;
import org.tukaani.xz.common.StreamFlags;
import org.tukaani.xz.check.Check;
import org.tukaani.xz.index.IndexEncoder;
/**
* Compresses into the .xz file format.
*/
public class XZOutputStream extends FinishableOutputStream {
private OutputStream out;
private final StreamFlags streamFlags = new StreamFlags();
private Check check;
private final IndexEncoder index = new IndexEncoder();
private FilterEncoder[] filters;
private BlockOutputStream blockEncoder = null;
private IOException exception = null;
private boolean finished = false;
/**
* Creates a new output stream that compresses data into the .xz format.
* This takes options for one filter as an argument. This constructor
* is equivalent to passing a single-member filterOptions array to the
* other constructor.
*
* @param out output stream to which the compressed data
* will be written
*
* @param filterOptions
* filter options to use
*
* @param checkType type of the integrity check,
* for example XZ.CHECK_CRC64
*
* @throws UnsupportedOptionsException
* invalid filter chain
*
* @throws IOException may be thrown from <code>out</code>
*/
public XZOutputStream(OutputStream out, FilterOptions filterOptions,
int checkType) throws IOException {
FilterOptions[] ops = new FilterOptions[1];
ops[0] = filterOptions;
initialize(out, ops, checkType);
}
/**
* Creates a new output stream that compresses data into the .xz format.
* This takes an array of filter options, allowing the caller to specify
* a filter chain with 1-4 filters.
*
* @param out output stream to which the compressed data
* will be written
*
* @param filterOptions
* array of filter options to use
*
* @param checkType type of the integrity check,
* for example XZ.CHECK_CRC64
*
* @throws UnsupportedOptionsException
* invalid filter chain
*
* @throws IOException may be thrown from <code>out</code>
*/
public XZOutputStream(OutputStream out, FilterOptions[] filterOptions,
int checkType) throws IOException {
initialize(out, filterOptions, checkType);
}
private void initialize(OutputStream out, FilterOptions[] filterOptions,
int checkType) throws IOException {
this.out = out;
updateFilters(filterOptions);
streamFlags.checkType = checkType;
check = Check.getInstance(checkType);
encodeStreamHeader();
}
/**
* Updates the filter chain.
* <p>
* Currently this cannot be used to update e.g. LZMA2 options in the
* middle of an XZ Block. Use <code>flush()</code> to finish the current
* XZ Block before calling this function. The new filter chain will then
* be used for the next XZ Block.
*/
public void updateFilters(FilterOptions[] filterOptions)
throws XZIOException {
if (blockEncoder != null)
throw new UnsupportedOptionsException("Changing filter options "
+ "in the middle of a XZ Block not implemented");
if (filterOptions.length < 1 || filterOptions.length > 4)
throw new UnsupportedOptionsException(
"XZ filter chain must be 1-4 filters");
FilterEncoder[] newFilters = new FilterEncoder[filterOptions.length];
for (int i = 0; i < filterOptions.length; ++i)
newFilters[i] = filterOptions[i].getFilterEncoder();
RawCoder.validate(newFilters);
filters = newFilters;
}
/**
* Writes one byte to be compressed.
*
* @throws XZIOException
* XZ stream has grown too big
* @throws IOException may be thrown by the underlying output stream
*/
public void write(int b) throws IOException {
byte[] buf = new byte[] { (byte)b };
write(buf, 0, 1);
}
/**
* Writes an array of bytes to be compressed.
* The compressors tend to do internal buffering and thus the written
* data won't be readable from the compressed output immediately.
* Use <code>flush()</code> to force everything written so far to
* be written to the underlying output stream, but be aware that
* flushing reduces compression ratio.
*
* @param buf buffer of bytes to be written
* @param off start offset in <code>buf</code>
* @param len number of bytes to write
*
* @throws XZIOException
* XZ stream has grown too big
* @throws XZIOException
* <code>finish()</code> or <code>close()</code>
* was already called
* @throws IOException may be thrown by the underlying output stream
*/
public void write(byte[] buf, int off, int len) throws IOException {
if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
throw new IllegalArgumentException();
if (len == 0)
return;
if (finished)
exception = new XZIOException(
"XZOutputStream.write was called on a finished stream");
if (exception != null)
throw exception;
if (blockEncoder == null)
blockEncoder = new BlockOutputStream(out, filters, check);
try {
blockEncoder.write(buf, off, len);
} catch (IOException e) {
exception = e;
throw e;
}
}
/**
* Flushes the encoder and calls <code>out.flush()</code>.
* <p>
* FIXME: I haven't decided yet how this will work in the final version.
* In the current implementation, flushing finishes the current .xz Block.
* This is equivalent to LZMA_FULL_FLUSH in liblzma (XZ Utils).
* Equivalent of liblzma's LZMA_SYNC_FLUSH might be implemented in
* the future, and perhaps should be what <code>flush()</code> should do.
*/
public void flush() throws IOException {
if (exception != null)
throw exception;
if (blockEncoder != null) {
try {
blockEncoder.finish();
index.add(blockEncoder.getUnpaddedSize(),
blockEncoder.getUncompressedSize());
blockEncoder = null;
} catch (IOException e) {
exception = e;
throw e;
}
}
out.flush();
}
/**
* Finishes compression without closing the underlying stream.
* No more data can be written to this stream after finishing
* (calling <code>write</code> with an empty buffer is OK).
* <p>
* Repeated calls to <code>finish()</code> do nothing unless
* an exception was thrown by this stream earlier. In that case
* the same exception is thrown again.
* <p>
* After finishing, the stream may be closed normally with
* <code>close()</code>. If the stream will be closed anyway, there
* usually is no need to call <code>finish()</code> separately.
*/
public void finish() throws IOException {
if (!finished) {
// flush() checks for pending exceptions so we don't need to
// worry about it here.
flush();
try {
index.encode(out);
encodeStreamFooter();
finished = true;
} catch (IOException e) {
exception = e;
throw e;
}
}
}
/**
* Finishes compression and closes the underlying stream.
* The underlying stream <code>out</code> is closed even if finishing
* fails. If both finishing and closing fail, the exception thrown
* by <code>finish()</code> is thrown and the exception from the failed
* <code>out.close()</code> is lost.
*/
public void close() throws IOException {
// If finish() throws an exception, it stores the exception to
// the variable "exception". So we can ignore the possible
// exception here.
try {
finish();
} catch (IOException e) {}
try {
out.close();
} catch (IOException e) {
// Remember the exception but only if there is no previous
// pending exception.
if (exception == null)
exception = e;
}
if (exception != null)
throw exception;
}
private void encodeStreamFlags(byte[] buf, int off) {
buf[off] = 0x00;
buf[off + 1] = (byte)streamFlags.checkType;
}
private void encodeStreamHeader() throws IOException {
out.write(XZ.HEADER_MAGIC);
byte[] buf = new byte[2];
encodeStreamFlags(buf, 0);
out.write(buf);
EncoderUtil.writeCRC32(out, buf);
}
private void encodeStreamFooter() throws IOException {
byte[] buf = new byte[6];
long backwardSize = index.getIndexSize() / 4 - 1;
for (int i = 0; i < 4; ++i)
buf[i] = (byte)(backwardSize >>> (i * 8));
encodeStreamFlags(buf, 4);
EncoderUtil.writeCRC32(out, buf);
out.write(buf);
out.write(XZ.FOOTER_MAGIC);
}
}
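The matching compression sketch ("output.xz" is a placeholder). Note that with the stub LZMA2 encoder above, the payload ends up in uncompressed LZMA2 chunks, so this exercises the .xz container and integrity check rather than real compression:

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import org.tukaani.xz.LZMA2Options;
import org.tukaani.xz.XZ;
import org.tukaani.xz.XZOutputStream;

class XZCompress {
    public static void main(String[] args) throws IOException {
        try (XZOutputStream out = new XZOutputStream(
                new BufferedOutputStream(new FileOutputStream("output.xz")),
                new LZMA2Options(), XZ.CHECK_CRC64)) {
            out.write("hello, xz".getBytes());
        } // close() finishes the stream: the Index and Stream Footer get written
    }
}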

View File

@ -1,33 +0,0 @@
/*
* CRC32
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.check;
public class CRC32 extends Check {
private final java.util.zip.CRC32 state = new java.util.zip.CRC32();
public CRC32() {
size = 4;
name = "CRC32";
}
public void update(byte[] buf, int off, int len) {
state.update(buf, off, len);
}
public byte[] finish() {
long value = state.getValue();
byte[] buf = new byte[] { (byte)(value),
(byte)(value >>> 8),
(byte)(value >>> 16),
(byte)(value >>> 24) };
state.reset();
return buf;
}
}

View File

@ -1,54 +0,0 @@
/*
* CRC64
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.check;
public class CRC64 extends Check {
private static final long poly = 0xC96C5795D7870F42L;
private static final long[] crcTable = new long[256];
private long crc = -1;
static {
for (int b = 0; b < crcTable.length; ++b) {
long r = b;
for (int i = 0; i < 8; ++i) {
if ((r & 1) == 1)
r = (r >>> 1) ^ poly;
else
r >>>= 1;
}
crcTable[b] = r;
}
}
public CRC64() {
size = 8;
name = "CRC64";
}
public void update(byte[] buf, int off, int len) {
int end = off + len;
while (off < end)
crc = crcTable[(buf[off++] ^ (int)crc) & 0xFF] ^ (crc >>> 8);
}
public byte[] finish() {
long value = ~crc;
crc = -1;
byte[] buf = new byte[8];
for (int i = 0; i < buf.length; ++i)
buf[i] = (byte)(value >> (i * 8));
return buf;
}
}

View File

@ -1,57 +0,0 @@
/*
* Check
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.check;
import org.tukaani.xz.XZ;
import org.tukaani.xz.UnsupportedOptionsException;
public abstract class Check {
int size;
String name;
public abstract void update(byte[] buf, int off, int len);
public abstract byte[] finish();
public void update(byte[] buf) {
update(buf, 0, buf.length);
}
public int getSize() {
return size;
}
public String getName() {
return name;
}
public static Check getInstance(int checkType)
throws UnsupportedOptionsException {
switch (checkType) {
case XZ.CHECK_NONE:
return new None();
case XZ.CHECK_CRC32:
return new CRC32();
case XZ.CHECK_CRC64:
return new CRC64();
case XZ.CHECK_SHA256:
try {
return new SHA256();
} catch (java.security.NoSuchAlgorithmException e) {}
break;
}
throw new UnsupportedOptionsException(
"Unsupported Check ID " + checkType);
}
}
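A sketch of the polymorphic check API (assumed helper). finish() returns the check value in the byte order used on disk, so for CRC64 this is eight little-endian bytes:

import org.tukaani.xz.UnsupportedOptionsException;
import org.tukaani.xz.XZ;
import org.tukaani.xz.check.Check;

class CheckSketch {
    static byte[] crc64Of(byte[] data) throws UnsupportedOptionsException {
        Check check = Check.getInstance(XZ.CHECK_CRC64);
        check.update(data);       // may be called repeatedly when streaming
        return check.finish();    // 8 bytes, little-endian; resets the state
    }
}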

View File

@ -1,24 +0,0 @@
/*
* None
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.check;
public class None extends Check {
public None() {
size = 0;
name = "None";
}
public void update(byte[] buf, int off, int len) {}
public byte[] finish() {
byte[] empty = new byte[0];
return empty;
}
}

View File

@ -1,30 +0,0 @@
/*
* SHA256
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.check;
public class SHA256 extends Check {
private final java.security.MessageDigest sha256;
public SHA256() throws java.security.NoSuchAlgorithmException {
size = 32;
name = "SHA-256";
sha256 = java.security.MessageDigest.getInstance("SHA-256");
}
public void update(byte[] buf, int off, int len) {
sha256.update(buf, off, len);
}
public byte[] finish() {
byte[] buf = sha256.digest();
sha256.reset();
return buf;
}
}

View File

@ -1,121 +0,0 @@
/*
* DecoderUtil
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.common;
import java.io.InputStream;
import java.io.IOException;
import java.io.EOFException;
import java.util.zip.CRC32;
import org.tukaani.xz.XZ;
import org.tukaani.xz.XZFormatException;
import org.tukaani.xz.CorruptedInputException;
import org.tukaani.xz.UnsupportedOptionsException;
public class DecoderUtil extends Util {
public static boolean isCRC32Valid(byte[] buf, int off, int len,
int ref_off) {
CRC32 crc32 = new CRC32();
crc32.update(buf, off, len);
long value = crc32.getValue();
for (int i = 0; i < 4; ++i)
if ((byte)(value >>> (i * 8)) != buf[ref_off + i])
return false;
return true;
}
public static StreamFlags decodeStreamHeader(byte[] buf)
throws IOException {
for (int i = 0; i < XZ.HEADER_MAGIC.length; ++i)
if (buf[i] != XZ.HEADER_MAGIC[i])
throw new XZFormatException();
if (!isCRC32Valid(buf, XZ.HEADER_MAGIC.length, 2,
XZ.HEADER_MAGIC.length + 2))
throw new CorruptedInputException("XZ Stream Header is corrupt");
try {
return decodeStreamFlags(buf, XZ.HEADER_MAGIC.length);
} catch (UnsupportedOptionsException e) {
throw new UnsupportedOptionsException(
"Unsupported options in XZ Stream Header");
}
}
public static StreamFlags decodeStreamFooter(byte[] buf)
throws IOException {
if (buf[10] != XZ.FOOTER_MAGIC[0] || buf[11] != XZ.FOOTER_MAGIC[1]) {
// NOTE: The exception could be XZFormatException too.
// It depends on the situation which one is better.
throw new CorruptedInputException("XZ Stream Footer is corrupt");
}
if (!isCRC32Valid(buf, 4, 6, 0))
throw new CorruptedInputException("XZ Stream Footer is corrupt");
StreamFlags streamFlags;
try {
streamFlags = decodeStreamFlags(buf, 8);
} catch (UnsupportedOptionsException e) {
throw new UnsupportedOptionsException(
"Unsupported options in XZ Stream Footer");
}
streamFlags.backwardSize = 0;
for (int i = 0; i < 4; ++i)
streamFlags.backwardSize |= (long)(buf[i + 4] & 0xFF) << (i * 8); // cast avoids sign extension from the top byte
streamFlags.backwardSize = (streamFlags.backwardSize + 1) * 4;
return streamFlags;
}
private static StreamFlags decodeStreamFlags(byte[] buf, int off)
throws UnsupportedOptionsException {
if (buf[off] != 0x00 || (buf[off + 1] & 0xFF) >= 0x10)
throw new UnsupportedOptionsException();
StreamFlags streamFlags = new StreamFlags();
streamFlags.checkType = buf[off + 1];
return streamFlags;
}
public static boolean areStreamFlagsEqual(StreamFlags a, StreamFlags b) {
// backwardSize is intentionally not compared.
return a.checkType == b.checkType;
}
public static long decodeVLI(InputStream in) throws IOException {
int b = in.read();
if (b == -1)
throw new EOFException();
long num = b & 0x7F;
int i = 0;
while ((b & 0x80) != 0x00) {
if (++i >= VLI_SIZE_MAX)
throw new CorruptedInputException();
b = in.read();
if (b == -1)
throw new EOFException();
if (b == 0x00)
throw new CorruptedInputException();
num |= (long)(b & 0x7F) << (i * 7);
}
return num;
}
}

View File

@ -1,36 +0,0 @@
/*
* EncoderUtil
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.common;
import java.io.OutputStream;
import java.io.IOException;
import java.util.zip.CRC32;
public class EncoderUtil extends Util {
public static void writeCRC32(OutputStream out, byte[] buf)
throws IOException {
CRC32 crc32 = new CRC32();
crc32.update(buf);
long value = crc32.getValue();
for (int i = 0; i < 4; ++i)
out.write((byte)(value >>> (i * 8)));
}
public static void encodeVLI(OutputStream out, long num)
throws IOException {
while (num >= 0x80) {
out.write((byte)(num | 0x80));
num >>>= 7;
}
out.write((byte)num);
}
}
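A round-trip sketch of the XZ variable-length integer coding used throughout the Index (seven payload bits per byte, high bit set on all but the last byte):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.tukaani.xz.common.DecoderUtil;
import org.tukaani.xz.common.EncoderUtil;

class VLIRoundTrip {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        EncoderUtil.encodeVLI(out, 300);   // encodes as 0xAC 0x02
        long decoded = DecoderUtil.decodeVLI(
                new ByteArrayInputStream(out.toByteArray()));
        System.out.println(decoded);       // prints 300
    }
}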

View File

@ -1,15 +0,0 @@
/*
* StreamFlags
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.common;
public class StreamFlags {
public int checkType = -1;
public long backwardSize = -1;
}

View File

@ -1,28 +0,0 @@
/*
* Util
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.common;
public class Util {
public static final int STREAM_HEADER_SIZE = 12;
public static final long BACKWARD_SIZE_MAX = 1L << 34;
public static final int BLOCK_HEADER_SIZE_MAX = 1024;
public static final long VLI_MAX = Long.MAX_VALUE;
public static final int VLI_SIZE_MAX = 9;
public static int getVLISize(long num) {
int size = 0;
do {
++size;
num >>= 7;
} while (num != 0);
return size;
}
}

View File

@ -1,27 +0,0 @@
/*
* DeltaCoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.delta;
abstract class DeltaCoder {
static final int DISTANCE_MIN = 1;
static final int DISTANCE_MAX = 256;
static final int DISTANCE_MASK = DISTANCE_MAX - 1;
final int distance;
final byte[] history = new byte[DISTANCE_MAX];
int pos = 0;
public DeltaCoder(int distance) {
if (distance < DISTANCE_MIN || distance > DISTANCE_MAX)
throw new IllegalArgumentException();
this.distance = distance;
}
}

View File

@ -1,24 +0,0 @@
/*
* DeltaDecoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.delta;
public class DeltaDecoder extends DeltaCoder {
public DeltaDecoder(int distance) {
super(distance);
}
public void decode(byte[] buf, int off, int len) {
int end = off + len;
for (int i = off; i < end; ++i) {
buf[i] += history[(distance + pos) & DISTANCE_MASK];
history[pos-- & DISTANCE_MASK] = buf[i];
}
}
}
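A worked example of the decoder (made-up input data). With distance 1, each output byte is the previous output byte plus the stored difference:

import org.tukaani.xz.delta.DeltaDecoder;

class DeltaSketch {
    public static void main(String[] args) {
        // Delta encoding of { 10, 11, 13, 16 } at distance 1:
        // first byte as-is, then differences 1, 2, 3.
        byte[] buf = { 10, 1, 2, 3 };
        new DeltaDecoder(1).decode(buf, 0, buf.length);
        for (byte b : buf)
            System.out.println(b);   // prints 10, 11, 13, 16
    }
}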

View File

@ -1,56 +0,0 @@
/*
* IndexBase
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.index;
import org.tukaani.xz.common.Util;
import org.tukaani.xz.XZIOException;
abstract class IndexBase {
private final XZIOException invalidIndexException;
long blocksSum = 0;
long uncompressedSum = 0;
long indexListSize = 0;
long recordCount = 0;
IndexBase(XZIOException invalidIndexException) {
this.invalidIndexException = invalidIndexException;
}
private long getUnpaddedIndexSize() {
// Index Indicator + Number of Records + List of Records + CRC32
return 1 + Util.getVLISize(recordCount) + indexListSize + 4;
}
public long getIndexSize() {
return (getUnpaddedIndexSize() + 3) & ~3;
}
long getStreamSize() {
return Util.STREAM_HEADER_SIZE + blocksSum + getIndexSize()
+ Util.STREAM_HEADER_SIZE;
}
int getIndexPaddingSize() {
return (int)((4 - getUnpaddedIndexSize()) & 3);
}
void add(long unpaddedSize, long uncompressedSize) throws XZIOException {
blocksSum += (unpaddedSize + 3) & ~3;
uncompressedSum += uncompressedSize;
indexListSize += Util.getVLISize(unpaddedSize)
+ Util.getVLISize(uncompressedSize);
++recordCount;
if (blocksSum < 0 || uncompressedSum < 0
|| getIndexSize() > Util.BACKWARD_SIZE_MAX
|| getStreamSize() < 0)
throw invalidIndexException;
}
}

View File

@ -1,59 +0,0 @@
/*
* IndexEncoder
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.index;
import java.io.OutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.zip.CheckedOutputStream;
import org.tukaani.xz.common.EncoderUtil;
import org.tukaani.xz.XZIOException;
public class IndexEncoder extends IndexBase {
private final ArrayList<IndexRecord> records = new ArrayList<>();
public IndexEncoder() {
super(new XZIOException("XZ Stream or its Index has grown too big"));
}
public void add(long unpaddedSize, long uncompressedSize)
throws XZIOException {
super.add(unpaddedSize, uncompressedSize);
records.add(new IndexRecord(unpaddedSize, uncompressedSize));
}
public void encode(OutputStream out) throws IOException {
java.util.zip.CRC32 crc32 = new java.util.zip.CRC32();
CheckedOutputStream outChecked = new CheckedOutputStream(out, crc32);
// Index Indicator
outChecked.write(0x00);
// Number of Records
EncoderUtil.encodeVLI(outChecked, recordCount);
// List of Records
for (IndexRecord record : records) {
    EncoderUtil.encodeVLI(outChecked, record.unpadded);
    EncoderUtil.encodeVLI(outChecked, record.uncompressed);
}
// Index Padding
for (int i = getIndexPaddingSize(); i > 0; --i)
outChecked.write(0x00);
// CRC32
long value = crc32.getValue();
for (int i = 0; i < 4; ++i)
out.write((byte)(value >>> (i * 8)));
}
}
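A sketch encoding an Index for two Blocks (the sizes are made up). The output layout is the 0x00 indicator, the record count as a VLI, one (unpadded size, uncompressed size) VLI pair per Block, zero padding to a multiple of four bytes, and a little-endian CRC32:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.tukaani.xz.index.IndexEncoder;

class IndexSketch {
    public static void main(String[] args) throws IOException {
        IndexEncoder index = new IndexEncoder();
        index.add(1234, 4096);   // Block 1: unpadded size, uncompressed size
        index.add(999, 2048);    // Block 2
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        index.encode(out);
        System.out.println("encoded Index size: " + out.size());
    }
}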

View File

@ -1,94 +0,0 @@
/*
* IndexHash
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.index;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.zip.CheckedInputStream;
import org.tukaani.xz.common.DecoderUtil;
import org.tukaani.xz.XZIOException;
import org.tukaani.xz.CorruptedInputException;
public class IndexHash extends IndexBase {
private org.tukaani.xz.check.Check hash;
public IndexHash() {
super(new CorruptedInputException());
try {
hash = new org.tukaani.xz.check.SHA256();
} catch (java.security.NoSuchAlgorithmException e) {
hash = new org.tukaani.xz.check.CRC32();
}
}
public void add(long unpaddedSize, long uncompressedSize)
throws XZIOException {
super.add(unpaddedSize, uncompressedSize);
ByteBuffer buf = ByteBuffer.allocate(2 * 8);
buf.putLong(unpaddedSize);
buf.putLong(uncompressedSize);
hash.update(buf.array());
}
public void validate(InputStream in) throws IOException {
// Index Indicator (0x00) has already been read by BlockInputStream
// so add 0x00 to the CRC32 here.
java.util.zip.CRC32 crc32 = new java.util.zip.CRC32();
crc32.update('\0');
CheckedInputStream inChecked = new CheckedInputStream(in, crc32);
// Get and validate the Number of Records field.
long storedRecordCount = DecoderUtil.decodeVLI(inChecked);
if (storedRecordCount != recordCount)
throw new CorruptedInputException("XZ Index is corrupt");
// Decode and hash the Index field and compare it to
// the hash value calculated from the decoded Blocks.
IndexHash stored = new IndexHash();
for (long i = 0; i < recordCount; ++i) {
long unpaddedSize = DecoderUtil.decodeVLI(inChecked);
long uncompressedSize = DecoderUtil.decodeVLI(inChecked);
try {
stored.add(unpaddedSize, uncompressedSize);
} catch (XZIOException e) {
throw new CorruptedInputException("XZ Index is corrupt");
}
if (stored.blocksSum > blocksSum
|| stored.uncompressedSum > uncompressedSum
|| stored.indexListSize > indexListSize)
throw new CorruptedInputException("XZ Index is corrupt");
}
if (stored.blocksSum != blocksSum
|| stored.uncompressedSum != uncompressedSum
|| stored.indexListSize != indexListSize
|| !Arrays.equals(stored.hash.finish(), hash.finish()))
throw new CorruptedInputException("XZ Index is corrupt");
// Index Padding
DataInputStream inData = new DataInputStream(inChecked);
for (int i = getIndexPaddingSize(); i > 0; --i)
if (inData.readUnsignedByte() != 0x00)
throw new CorruptedInputException("XZ Index is corrupt");
// CRC32
long value = crc32.getValue();
for (int i = 0; i < 4; ++i)
if (((value >>> (i * 8)) & 0xFF) != inData.readUnsignedByte())
throw new CorruptedInputException("XZ Index is corrupt");
}
}

View File

@ -1,20 +0,0 @@
/*
* IndexRecord
*
* Author: Lasse Collin <lasse.collin@tukaani.org>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.index;
public class IndexRecord {
public final long unpadded;
public final long uncompressed;
IndexRecord(long unpadded, long uncompressed) {
this.unpadded = unpadded;
this.uncompressed = uncompressed;
}
}

View File

@ -1,126 +0,0 @@
/*
* LZDecoder
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.lz;
import java.io.DataInputStream;
import java.io.IOException;
import org.tukaani.xz.CorruptedInputException;
public final class LZDecoder {
private final byte[] buf;
private int start = 0;
private int pos = 0;
private int full = 0;
private int limit = 0;
private int pendingLen = 0;
private int pendingDist = 0;
public LZDecoder(int dictSize, byte[] presetDict) {
buf = new byte[dictSize];
if (presetDict != null) {
pos = Math.min(presetDict.length, dictSize);
full = pos;
start = pos;
System.arraycopy(presetDict, presetDict.length - pos, buf, 0, pos);
}
}
public void reset() {
start = 0;
pos = 0;
full = 0;
limit = 0;
buf[buf.length - 1] = 0x00;
}
public void setLimit(int outMax) {
if (buf.length - pos <= outMax)
limit = buf.length;
else
limit = pos + outMax;
}
public boolean hasSpace() {
return pos < limit;
}
public boolean hasPending() {
return pendingLen > 0;
}
public int getPos() {
return pos;
}
public int getByte(int dist) {
int offset = pos - dist - 1;
if (dist >= pos)
offset += buf.length;
return buf[offset] & 0xFF;
}
public void putByte(byte b) {
buf[pos++] = b;
if (full < pos)
full = pos;
}
public void repeat(int dist, int len) throws IOException {
if (dist < 0 || dist >= full)
throw new CorruptedInputException();
int left = Math.min(limit - pos, len);
pendingLen = len - left;
pendingDist = dist;
int back = pos - dist - 1;
if (dist >= pos)
back += buf.length;
do {
buf[pos++] = buf[back++];
if (back == buf.length)
back = 0;
} while (--left > 0);
if (full < pos)
full = pos;
}
public void repeatPending() throws IOException {
if (pendingLen > 0)
repeat(pendingDist, pendingLen);
}
public void copyUncompressed(DataInputStream inData, int len)
throws IOException {
int copySize = Math.min(buf.length - pos, len);
inData.readFully(buf, pos, copySize);
pos += copySize;
if (full < pos)
full = pos;
}
public int flush(byte[] out, int outOff) {
int copySize = pos - start;
if (pos == buf.length)
pos = 0;
System.arraycopy(buf, start, out, outOff, copySize);
start = pos;
return copySize;
}
}
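A worked example of the sliding-window mechanics (made-up literals and match). dist is zero-based, so dist=1 copies starting two bytes back, and a match may overlap the bytes it is producing:

import java.io.IOException;
import org.tukaani.xz.lz.LZDecoder;

class LZWindowSketch {
    public static void main(String[] args) throws IOException {
        LZDecoder lz = new LZDecoder(64, null);  // 64-byte dictionary, no preset
        lz.setLimit(64);
        lz.putByte((byte)'a');
        lz.putByte((byte)'b');
        lz.putByte((byte)'c');
        lz.repeat(1, 4);                         // overlapping copy: "bcbc"
        byte[] out = new byte[16];
        int n = lz.flush(out, 0);
        System.out.println(new String(out, 0, n)); // prints "abcbcbc"
    }
}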

View File

@ -1,139 +0,0 @@
/*
* LZMACoder
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.lzma;
import org.tukaani.xz.rangecoder.RangeCoder;
abstract class LZMACoder {
static final int POS_STATES_MAX = 1 << 4;
static final int MATCH_LEN_MIN = 2;
static final int MATCH_LEN_MAX = MATCH_LEN_MIN + LengthCoder.LOW_SYMBOLS
+ LengthCoder.MID_SYMBOLS
+ LengthCoder.HIGH_SYMBOLS - 1;
static final int DIST_STATES = 4;
static final int DIST_SLOTS = 1 << 6;
static final int DIST_MODEL_START = 4;
static final int DIST_MODEL_END = 14;
static final int ALIGN_BITS = 4;
static final int ALIGN_SIZE = 1 << ALIGN_BITS;
static final int ALIGN_MASK = ALIGN_SIZE - 1;
static final int REPS = 4;
final int posMask;
final int[] rep = new int[4];
final State state = new State();
final short[][] isMatch = new short[State.STATES][POS_STATES_MAX];
final short[] isRep = new short[State.STATES];
final short[] isRep0 = new short[State.STATES];
final short[] isRep1 = new short[State.STATES];
final short[] isRep2 = new short[State.STATES];
final short[][] isRep0Long = new short[State.STATES][POS_STATES_MAX];
final short[][] distSlots = new short[DIST_STATES][DIST_SLOTS];
final short[][] distSpecial = { new short[2], new short[2],
new short[4], new short[4],
new short[8], new short[8],
new short[16], new short[16],
new short[32], new short[32] };
final short[] distAlign = new short[ALIGN_SIZE];
static int getDistState(int len) {
return len < DIST_STATES + MATCH_LEN_MIN
? len - MATCH_LEN_MIN
: DIST_STATES - 1;
}
LZMACoder(int pb) {
posMask = (1 << pb) - 1;
}
void reset() {
rep[0] = 0;
rep[1] = 0;
rep[2] = 0;
rep[3] = 0;
state.reset();
for (int i = 0; i < isMatch.length; ++i)
RangeCoder.initProbs(isMatch[i]);
RangeCoder.initProbs(isRep);
RangeCoder.initProbs(isRep0);
RangeCoder.initProbs(isRep1);
RangeCoder.initProbs(isRep2);
for (int i = 0; i < isRep0Long.length; ++i)
RangeCoder.initProbs(isRep0Long[i]);
for (int i = 0; i < distSlots.length; ++i)
RangeCoder.initProbs(distSlots[i]);
for (int i = 0; i < distSpecial.length; ++i)
RangeCoder.initProbs(distSpecial[i]);
RangeCoder.initProbs(distAlign);
}
abstract static class LiteralCoder {
private final int lc;
private final int literalPosMask;
LiteralCoder(int lc, int lp) {
this.lc = lc;
this.literalPosMask = (1 << lp) - 1;
}
final int getSubcoderIndex(int prevByte, int pos) {
int low = prevByte >> (8 - lc);
int high = (pos & literalPosMask) << lc;
return low + high;
}
abstract class LiteralSubcoder {
final short[] probs = new short[0x300];
void reset() {
RangeCoder.initProbs(probs);
}
}
}
abstract static class LengthCoder {
static final int LOW_SYMBOLS = 1 << 3;
static final int MID_SYMBOLS = 1 << 3;
static final int HIGH_SYMBOLS = 1 << 8;
final short[] choice = new short[2];
final short[][] low = new short[POS_STATES_MAX][LOW_SYMBOLS];
final short[][] mid = new short[POS_STATES_MAX][MID_SYMBOLS];
final short[] high = new short[HIGH_SYMBOLS];
void reset() {
RangeCoder.initProbs(choice);
for (int i = 0; i < low.length; ++i)
RangeCoder.initProbs(low[i]);
for (int i = 0; i < mid.length; ++i)
RangeCoder.initProbs(mid[i]);
RangeCoder.initProbs(high);
}
}
}

View File

@ -1,189 +0,0 @@
/*
* LZMADecoder
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.lzma;
import java.io.IOException;
import org.tukaani.xz.lz.LZDecoder;
import org.tukaani.xz.rangecoder.RangeDecoder;
import org.tukaani.xz.CorruptedInputException;
public final class LZMADecoder extends LZMACoder {
private final LZDecoder lz;
private final RangeDecoder rc;
private final LiteralDecoder literalDecoder;
private final LengthDecoder matchLenDecoder = new LengthDecoder();
private final LengthDecoder repLenDecoder = new LengthDecoder();
public LZMADecoder(LZDecoder lz, RangeDecoder rc, int lc, int lp, int pb) {
super(pb);
this.lz = lz;
this.rc = rc;
this.literalDecoder = new LiteralDecoder(lc, lp);
reset();
}
public void reset() {
super.reset();
literalDecoder.reset();
matchLenDecoder.reset();
repLenDecoder.reset();
}
public void decode() throws IOException {
lz.repeatPending();
while (lz.hasSpace()) {
int posState = lz.getPos() & posMask;
if (rc.decodeBit(isMatch[state.get()], posState) == 0) {
literalDecoder.decode();
} else {
int len = rc.decodeBit(isRep, state.get()) == 0
? decodeMatch(posState)
: decodeRepMatch(posState);
lz.repeat(rep[0], len);
}
}
rc.normalize();
if (!rc.isInBufferOK())
throw new CorruptedInputException();
}
private int decodeMatch(int posState) throws IOException {
state.updateMatch();
        // Shift the distance history down; the new distance lands in rep[0].
        rep[3] = rep[2];
rep[2] = rep[1];
rep[1] = rep[0];
int len = matchLenDecoder.decode(posState);
int distSlot = rc.decodeBitTree(distSlots[getDistState(len)]);
if (distSlot < DIST_MODEL_START) {
rep[0] = distSlot;
} else {
int limit = (distSlot >> 1) - 1;
rep[0] = (2 | (distSlot & 1)) << limit;
if (distSlot < DIST_MODEL_END) {
rep[0] |= rc.decodeReverseBitTree(
distSpecial[distSlot - DIST_MODEL_START]);
} else {
rep[0] |= rc.decodeDirectBits(limit - ALIGN_BITS)
<< ALIGN_BITS;
rep[0] |= rc.decodeReverseBitTree(distAlign);
}
}
return len;
}
    // Decode a repeated match: rep[0..3] form a most-recently-used
    // queue of the last four match distances.
    private int decodeRepMatch(int posState) throws IOException {
if (rc.decodeBit(isRep0, state.get()) == 0) {
if (rc.decodeBit(isRep0Long[state.get()], posState) == 0) {
state.updateShortRep();
return 1;
}
} else {
int tmp;
if (rc.decodeBit(isRep1, state.get()) == 0) {
tmp = rep[1];
} else {
if (rc.decodeBit(isRep2, state.get()) == 0) {
tmp = rep[2];
} else {
tmp = rep[3];
rep[3] = rep[2];
}
rep[2] = rep[1];
}
rep[1] = rep[0];
rep[0] = tmp;
}
state.updateLongRep();
return repLenDecoder.decode(posState);
}
private class LiteralDecoder extends LiteralCoder {
final LiteralSubdecoder[] subdecoders;
LiteralDecoder(int lc, int lp) {
super(lc, lp);
subdecoders = new LiteralSubdecoder[1 << (lc + lp)];
for (int i = 0; i < subdecoders.length; ++i)
subdecoders[i] = new LiteralSubdecoder();
}
void reset() {
for (int i = 0; i < subdecoders.length; ++i)
subdecoders[i].reset();
}
void decode() throws IOException {
int i = getSubcoderIndex(lz.getByte(0), lz.getPos());
subdecoders[i].decode();
}
private class LiteralSubdecoder extends LiteralSubcoder {
void decode() throws IOException {
int symbol = 1;
if (state.isLiteral()) {
do {
symbol = (symbol << 1) | rc.decodeBit(probs, symbol);
} while (symbol < 0x100);
                } else {
                    // After a match, use the byte at distance rep[0] as
                    // extra context until the first mismatching bit.
                    int matchByte = lz.getByte(rep[0]);
                    int offset = 0x100;
                    int matchBit;
                    int bit;
do {
matchByte <<= 1;
matchBit = matchByte & offset;
bit = rc.decodeBit(probs, offset + matchBit + symbol);
symbol = (symbol << 1) | bit;
offset &= (-bit) ^ ~matchBit;
} while (symbol < 0x100);
}
lz.putByte((byte)symbol);
state.updateLiteral();
}
}
}
private class LengthDecoder extends LengthCoder {
        int decode(int posState) throws IOException {
            // choice[0] splits off low lengths; choice[1] splits mid from high.
            if (rc.decodeBit(choice, 0) == 0)
return rc.decodeBitTree(low[posState]) + MATCH_LEN_MIN;
if (rc.decodeBit(choice, 1) == 0)
return rc.decodeBitTree(mid[posState])
+ MATCH_LEN_MIN + LOW_SYMBOLS;
return rc.decodeBitTree(high)
+ MATCH_LEN_MIN + LOW_SYMBOLS + MID_SYMBOLS;
}
}
}

View File

@ -1,65 +0,0 @@
/*
* State
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.lzma;
final class State {
static final int STATES = 12;
private static final int LIT_STATES = 7;
private static final int LIT_LIT = 0;
private static final int MATCH_LIT_LIT = 1;
private static final int REP_LIT_LIT = 2;
private static final int SHORTREP_LIT_LIT = 3;
private static final int MATCH_LIT = 4;
private static final int REP_LIT = 5;
private static final int SHORTREP_LIT = 6;
private static final int LIT_MATCH = 7;
private static final int LIT_LONGREP = 8;
private static final int LIT_SHORTREP = 9;
private static final int NONLIT_MATCH = 10;
private static final int NONLIT_REP = 11;
private int state;
void reset() {
state = LIT_LIT;
}
int get() {
return state;
}
    void updateLiteral() {
        if (state <= SHORTREP_LIT_LIT)
            state = LIT_LIT;  // history is already literal-heavy
        else if (state <= LIT_SHORTREP)
            state -= 3;       // e.g. LIT_MATCH -> MATCH_LIT
        else
            state -= 6;       // e.g. NONLIT_MATCH -> MATCH_LIT
    }
void updateMatch() {
state = state < LIT_STATES ? LIT_MATCH : NONLIT_MATCH;
}
void updateLongRep() {
state = state < LIT_STATES ? LIT_LONGREP : NONLIT_REP;
}
void updateShortRep() {
state = state < LIT_STATES ? LIT_SHORTREP : NONLIT_REP;
}
boolean isLiteral() {
return state < LIT_STATES;
}
}

View File

@ -1,21 +0,0 @@
/**
* XZ data compression support.
* <p>
* In the (very) long term, this aims to be a complete implementation of
* XZ data compression in Java. Currently only streamed decompression is
* supported.
* <p>
* For the latest source code, see the
* <a href="http://tukaani.org/xz/java.html">home page of XZ in Java</a>.
*
* <h3>Decompression notes</h3>
*
* If you are decompressing complete files and your application knows
* exactly how much uncompressed data there should be, it is still good
* to try reading one more byte by calling <code>read()</code> and checking
* that it returns <code>-1</code>. This way the decompressor will parse the
* file footers and verify the integrity checks, giving the caller more
* confidence that the uncompressed data is valid. (This advice seems to
* apply to <code>java.util.zip.GZIPInputStream</code> too.)
*/
package org.tukaani.xz;
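
A minimal sketch of the advice above, assuming the stream is consumed
through this package's XZInputStream (file name and expected size are
placeholders):

    import java.io.DataInputStream;
    import java.io.FileInputStream;
    import java.io.IOException;
    import org.tukaani.xz.XZInputStream;

    public class XZFullReadCheck {
        // Decompress a file whose uncompressed size is known up front,
        // then read one extra byte so the decompressor parses the stream
        // footer and verifies the integrity check.
        public static byte[] decompress(String path, int knownSize) throws IOException {
            try (XZInputStream xz = new XZInputStream(new FileInputStream(path))) {
                byte[] data = new byte[knownSize];
                new DataInputStream(xz).readFully(data);
                if (xz.read() != -1)
                    throw new IOException("data continues past the expected end");
                return data;
            }
        }
    }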

View File

@ -1,25 +0,0 @@
/*
* RangeCoder
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.rangecoder;
public abstract class RangeCoder {
static final int SHIFT_BITS = 8;
static final int TOP_MASK = 0xFF000000;
static final int BIT_MODEL_TOTAL_BITS = 11;
static final int BIT_MODEL_TOTAL = 1 << BIT_MODEL_TOTAL_BITS;
static final short PROB_INIT = (short)(BIT_MODEL_TOTAL / 2);
static final int MOVE_BITS = 5;
public static void initProbs(short[] probs) {
for (int i = 0; i < probs.length; ++i)
probs[i] = PROB_INIT;
}
}

View File

@ -1,129 +0,0 @@
/*
* RangeDecoder
*
* Authors: Lasse Collin <lasse.collin@tukaani.org>
* Igor Pavlov <http://7-zip.org/>
*
* This file has been put into the public domain.
* You can do whatever you want with this file.
*/
package org.tukaani.xz.rangecoder;
import java.io.DataInputStream;
import java.io.IOException;
import org.tukaani.xz.CorruptedInputException;
public final class RangeDecoder extends RangeCoder {
private static final int INIT_SIZE = 5;
private final byte[] buf;
private int pos = 0;
private int end = 0;
private int range = 0;
private int code = 0;
public RangeDecoder(int inputSizeMax) {
buf = new byte[inputSizeMax - INIT_SIZE];
}
public void prepareInputBuffer(DataInputStream in, int len)
throws IOException {
if (len < INIT_SIZE)
throw new CorruptedInputException();
        // The first byte of a range-coded chunk must always be 0x00.
        if (in.readUnsignedByte() != 0x00)
            throw new CorruptedInputException();
code = in.readInt();
range = 0xFFFFFFFF;
pos = 0;
end = len - INIT_SIZE;
in.readFully(buf, 0, end);
}
public boolean isInBufferOK() {
return pos <= end;
}
public boolean isFinished() {
return pos == end && code == 0;
}
public void normalize() throws IOException {
if ((range & TOP_MASK) == 0) {
try {
// If the input is corrupt, this might throw
// ArrayIndexOutOfBoundsException.
code = (code << SHIFT_BITS) | (buf[pos++] & 0xFF);
range <<= SHIFT_BITS;
} catch (ArrayIndexOutOfBoundsException e) {
throw new CorruptedInputException();
}
}
}
public int decodeBit(short[] probs, int index) throws IOException {
normalize();
int prob = probs[index];
int bound = (range >>> BIT_MODEL_TOTAL_BITS) * prob;
int bit;
// Compare code and bound as if they were unsigned 32-bit integers.
if ((code ^ 0x80000000) < (bound ^ 0x80000000)) {
range = bound;
probs[index] = (short)(
prob + ((BIT_MODEL_TOTAL - prob) >>> MOVE_BITS));
bit = 0;
} else {
range -= bound;
code -= bound;
probs[index] = (short)(prob - (prob >>> MOVE_BITS));
bit = 1;
}
return bit;
}
public int decodeBitTree(short[] probs) throws IOException {
int symbol = 1;
do {
symbol = (symbol << 1) | decodeBit(probs, symbol);
} while (symbol < probs.length);
return symbol - probs.length;
}
public int decodeReverseBitTree(short[] probs) throws IOException {
int symbol = 1;
int i = 0;
int result = 0;
do {
int bit = decodeBit(probs, symbol);
symbol = (symbol << 1) | bit;
result |= bit << i++;
} while (symbol < probs.length);
return result;
}
    public int decodeDirectBits(int count) throws IOException {
        int result = 0;
        do {
            normalize();
            range >>>= 1;
            // Branchless extraction: t is 0 when code >= range
            // (decoded bit 1) and 1 otherwise (decoded bit 0).
            int t = (code - range) >>> 31;
            code -= range & (t - 1);
            result = (result << 1) | (1 - t);
        } while (--count != 0);
        return result;
    }
}
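
A closing note on the bit-tree helpers above: decodeBitTree walks an
implicit binary tree stored in a flat probability array. The index starts
at 1 and each decoded bit picks a child, so a 2^n-entry array yields an
n-bit symbol. The same indexing, with the range decoder swapped for a
fixed bit list (illustrative only):

    static int bitTreeDemo(int[] bits) {
        int treeSize = 1 << bits.length;   // probs.length in the real code
        int symbol = 1;
        for (int bit : bits)
            symbol = (symbol << 1) | bit;  // descend: 0 = left, 1 = right
        return symbol - treeSize;          // leaf offset == decoded value
    }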