(screenshot-capture-tool) Make screenshot-capture-tool cooperate with docker
This commit is contained in:
parent
a8b9d21f2d
commit
3047e2dd7c
@ -8,7 +8,6 @@ import lombok.SneakyThrows;
|
|||||||
import nu.marginalia.hash.MurmurHash3_128;
|
import nu.marginalia.hash.MurmurHash3_128;
|
||||||
import nu.marginalia.link_parser.LinkParser;
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
|
||||||
import nu.marginalia.storage.model.*;
|
import nu.marginalia.storage.model.*;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
@ -3,6 +3,7 @@ plugins {
|
|||||||
|
|
||||||
id 'application'
|
id 'application'
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
|
id 'com.palantir.docker' version '0.35.0'
|
||||||
}
|
}
|
||||||
|
|
||||||
java {
|
java {
|
||||||
@ -16,6 +17,8 @@ application {
|
|||||||
applicationName = 'screenshot-capture-tool'
|
applicationName = 'screenshot-capture-tool'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
apply from: "$rootProject.projectDir/docker-service.gradle"
|
||||||
|
|
||||||
tasks.distZip.enabled = false
|
tasks.distZip.enabled = false
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
@ -30,6 +33,7 @@ dependencies {
|
|||||||
implementation libs.commons.compress
|
implementation libs.commons.compress
|
||||||
implementation libs.commons.io
|
implementation libs.commons.io
|
||||||
implementation libs.guice
|
implementation libs.guice
|
||||||
|
implementation libs.gson
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
|
@ -1,37 +1,28 @@
|
|||||||
package nu.marginalia.screenshot;
|
package nu.marginalia.screenshot;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.service.module.DatabaseModule;
|
import nu.marginalia.service.module.DatabaseModule;
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
import org.openqa.selenium.OutputType;
|
|
||||||
import org.openqa.selenium.PageLoadStrategy;
|
|
||||||
import org.openqa.selenium.TimeoutException;
|
|
||||||
import org.openqa.selenium.chrome.ChromeDriver;
|
|
||||||
import org.openqa.selenium.chrome.ChromeDriverService;
|
import org.openqa.selenium.chrome.ChromeDriverService;
|
||||||
import org.openqa.selenium.chrome.ChromeOptions;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.imageio.ImageIO;
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.http.HttpClient;
|
import java.net.http.HttpClient;
|
||||||
import java.net.http.HttpRequest;
|
import java.net.http.HttpRequest;
|
||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.sql.Connection;
|
import java.sql.Connection;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.Collections;
|
||||||
import java.util.stream.Collectors;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import org.openqa.selenium.support.ui.ExpectedCondition;
|
import java.util.concurrent.TimeUnit;
|
||||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
|
||||||
import org.openqa.selenium.JavascriptExecutor;
|
|
||||||
|
|
||||||
public class ScreenshotCaptureToolMain {
|
public class ScreenshotCaptureToolMain {
|
||||||
|
|
||||||
@ -43,30 +34,21 @@ public class ScreenshotCaptureToolMain {
|
|||||||
|
|
||||||
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
|
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
|
||||||
|
|
||||||
ChromeDriver driver = initChromeDriver();
|
|
||||||
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
|
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
|
||||||
|
|
||||||
HttpClient httpClient = HttpClient.newHttpClient();
|
HttpClient httpClient = HttpClient.newBuilder()
|
||||||
|
.version(HttpClient.Version.HTTP_1_1)
|
||||||
|
.connectTimeout(Duration.ofSeconds(30))
|
||||||
|
.build()
|
||||||
|
;
|
||||||
|
|
||||||
try (Connection conn = ds.getConnection()) {
|
try (Connection conn = ds.getConnection()) {
|
||||||
|
for (var domain : crawlQueue) {
|
||||||
|
|
||||||
logger.info("Probing domains");
|
|
||||||
var ret = crawlQueue.parallelStream().collect(Collectors.partitioningBy(domain -> probeUrl(httpClient, domain)));
|
|
||||||
|
|
||||||
var badDomains = ret.getOrDefault(Boolean.FALSE, Collections.emptyList());
|
|
||||||
var goodDomains = ret.getOrDefault(Boolean.TRUE, Collections.emptyList());
|
|
||||||
|
|
||||||
logger.info("Result: {} good domains, {} bad domains", goodDomains.size(), badDomains.size());
|
|
||||||
|
|
||||||
badDomains.forEach(domain -> flagDomainAsFetched(conn, domain));
|
|
||||||
|
|
||||||
for (var domain : goodDomains) {
|
|
||||||
logger.info("Fetching {}", domain);
|
logger.info("Fetching {}", domain);
|
||||||
|
|
||||||
var filePath = fetchDomain(driver, domain);
|
byte[] webpBytes = fetchDomain(httpClient, domain);
|
||||||
if (filePath != null) {
|
if (webpBytes != null) {
|
||||||
uploadScreenshot(conn, domain, filePath);
|
uploadScreenshot(conn, domain, webpBytes);
|
||||||
} else {
|
} else {
|
||||||
flagDomainAsFetched(conn, domain);
|
flagDomainAsFetched(conn, domain);
|
||||||
}
|
}
|
||||||
@ -74,38 +56,15 @@ public class ScreenshotCaptureToolMain {
|
|||||||
|
|
||||||
} catch (SQLException e) {
|
} catch (SQLException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
} finally {
|
|
||||||
driver.quit();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@NotNull
|
|
||||||
private static ChromeDriver initChromeDriver() {
|
|
||||||
System.setProperty("webdriver.chrome.driver", "./chromedriver");
|
|
||||||
ChromeOptions options = new ChromeOptions();
|
|
||||||
|
|
||||||
options.setPageLoadStrategy(PageLoadStrategy.NONE);
|
|
||||||
options.setPageLoadTimeout(Duration.ofSeconds(30));
|
|
||||||
|
|
||||||
options.addArguments(
|
|
||||||
"no-sandbox",
|
|
||||||
"headless",
|
|
||||||
"user-agent=search.marginalia.nu",
|
|
||||||
"window-size=1024,768",
|
|
||||||
"force-device-scale-factor=0.5",
|
|
||||||
"high-dpi-support=0.5",
|
|
||||||
"dns-prefetch-disable",
|
|
||||||
"disable-gpu",
|
|
||||||
"disable-dev-shm-usage",
|
|
||||||
"disable-software-rasterizer",
|
|
||||||
"disable-extensions"
|
|
||||||
);
|
|
||||||
|
|
||||||
return new ChromeDriver(options);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
|
private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
|
||||||
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) {
|
try (var stmt = conn.prepareStatement("""
|
||||||
|
REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE)
|
||||||
|
VALUES (?, NOW())
|
||||||
|
"""))
|
||||||
|
{
|
||||||
stmt.setString(1, domain.toString());
|
stmt.setString(1, domain.toString());
|
||||||
stmt.executeUpdate();
|
stmt.executeUpdate();
|
||||||
} catch (SQLException e) {
|
} catch (SQLException e) {
|
||||||
@ -113,87 +72,68 @@ public class ScreenshotCaptureToolMain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) {
|
private static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] webpBytes) {
|
||||||
logger.info("Uploading {}", screenshotPath);
|
try (var stmt = conn.prepareStatement("""
|
||||||
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)");
|
REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA)
|
||||||
var is = Files.newInputStream(screenshotPath)
|
VALUES (?,?,?)
|
||||||
|
""");
|
||||||
|
var is = new ByteArrayInputStream(webpBytes)
|
||||||
) {
|
) {
|
||||||
stmt.setString(1, domain.toString());
|
stmt.setString(1, domain.toString());
|
||||||
stmt.setString(2, "image/webp");
|
stmt.setString(2, "image/png");
|
||||||
stmt.setBlob(3, is);
|
stmt.setBlob(3, is);
|
||||||
stmt.executeUpdate();
|
stmt.executeUpdate();
|
||||||
|
|
||||||
Files.delete(screenshotPath);
|
|
||||||
|
|
||||||
} catch (SQLException | IOException e) {
|
} catch (SQLException | IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
flagDomainAsFetched(conn, domain);
|
flagDomainAsFetched(conn, domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean probeUrl(HttpClient httpClient, EdgeDomain domain) {
|
private static Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
|
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
|
||||||
try {
|
try {
|
||||||
|
Map<String, Object> requestData = Map.of(
|
||||||
|
"url", domain.toRootUrl().toString(),
|
||||||
|
"options",
|
||||||
|
Map.of("fullPage", false,
|
||||||
|
"type", "png"),
|
||||||
|
"gotoOptions", Map.of(
|
||||||
|
"waitUntil", "networkidle2",
|
||||||
|
"timeout", TimeUnit.SECONDS.toMillis(10)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
var request = HttpRequest.newBuilder()
|
var request = HttpRequest.newBuilder()
|
||||||
.uri(new URI(domain.toRootUrl().toString()))
|
.uri(new URI("http://browserless:3000/screenshot"))
|
||||||
.timeout(Duration.ofSeconds(5))
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
.method("HEAD", HttpRequest.BodyPublishers.noBody())
|
gson.toJson(requestData)
|
||||||
.header("user-agent", "search.marginialia.nu")
|
))
|
||||||
|
.header("Content-type", "application/json")
|
||||||
.build();
|
.build();
|
||||||
|
var rsp = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||||
|
|
||||||
var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
if (rsp.statusCode() >= 300) {
|
||||||
|
|
||||||
return response.statusCode() < 400;
|
|
||||||
} catch (Exception ex) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
private static Path fetchDomain(ChromeDriver driver, EdgeDomain domain) {
|
|
||||||
try {
|
|
||||||
driver.get(domain.toRootUrl().toString());
|
|
||||||
|
|
||||||
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
|
|
||||||
|
|
||||||
try {
|
|
||||||
wait.until((ExpectedCondition<Boolean>) wd -> {
|
|
||||||
if (wd instanceof JavascriptExecutor jse) {
|
|
||||||
return "complete".equals(jse.executeScript("return document.readyState"));
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
catch (TimeoutException ex) {
|
|
||||||
logger.info("Wait timed out, forcing window.stop()");
|
|
||||||
driver.executeScript("window.stop()");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES);
|
|
||||||
|
|
||||||
final var img = ImageIO.read(new ByteArrayInputStream(bytes));
|
|
||||||
|
|
||||||
|
|
||||||
Path destPath = Files.createTempFile("website-screenshot-", ".webp");
|
|
||||||
ImageIO.write(img, "webp", destPath.toFile());
|
|
||||||
|
|
||||||
// If the screenshot is very small by size, it's very likely not particularly interesting to look at
|
|
||||||
if (Files.size(destPath) < 3500) {
|
|
||||||
Files.delete(destPath);
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return destPath;
|
byte[] image = rsp.body();
|
||||||
|
if (image.length < 3500) {
|
||||||
|
logger.warn("Skipping {} due to size ({})", domain, image.length);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return image;
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("Exception in screenshotting " + domain, ex);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
|
private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
|
||||||
|
|
||||||
List<EdgeDomain> ret = new ArrayList<>(queueSize);
|
List<EdgeDomain> ret = new ArrayList<>(queueSize);
|
||||||
|
|
||||||
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
||||||
@ -208,8 +148,8 @@ public class ScreenshotCaptureToolMain {
|
|||||||
ret.add(new EdgeDomain(rsp.getString(1)));
|
ret.add(new EdgeDomain(rsp.getString(1)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (Exception ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("Exception in fetching queue", ex);
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
|
53
docker-compose-screenshot-bot.yml
Normal file
53
docker-compose-screenshot-bot.yml
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
x-svc: &service
|
||||||
|
env_file:
|
||||||
|
- "run/env/service.env"
|
||||||
|
volumes:
|
||||||
|
- conf:/wmsa/conf:ro
|
||||||
|
- data:/wmsa/data
|
||||||
|
- logs:/var/log/wmsa
|
||||||
|
networks:
|
||||||
|
- wmsa
|
||||||
|
services:
|
||||||
|
screenshot-capture-tool:
|
||||||
|
<<: *service
|
||||||
|
image: "marginalia.nu/screenshot-capture-tool"
|
||||||
|
container_name: "screenshot-capture-tool"
|
||||||
|
networks:
|
||||||
|
- wmsa
|
||||||
|
- headlesschrome
|
||||||
|
depends_on:
|
||||||
|
- browserless
|
||||||
|
browserless:
|
||||||
|
<<: *service
|
||||||
|
image: "browserless/chrome"
|
||||||
|
container_name: "headlesschrome"
|
||||||
|
env_file:
|
||||||
|
- "run/env/browserless.env"
|
||||||
|
ports:
|
||||||
|
- "3000:3000"
|
||||||
|
networks:
|
||||||
|
- wmsa
|
||||||
|
- headlesschrome
|
||||||
|
|
||||||
|
networks:
|
||||||
|
wmsa:
|
||||||
|
headlesschrome:
|
||||||
|
volumes:
|
||||||
|
logs:
|
||||||
|
driver: local
|
||||||
|
driver_opts:
|
||||||
|
type: none
|
||||||
|
o: bind
|
||||||
|
device: run/logs
|
||||||
|
conf:
|
||||||
|
driver: local
|
||||||
|
driver_opts:
|
||||||
|
type: none
|
||||||
|
o: bind
|
||||||
|
device: run/conf
|
||||||
|
data:
|
||||||
|
driver: local
|
||||||
|
driver_opts:
|
||||||
|
type: none
|
||||||
|
o: bind
|
||||||
|
device: run/data
|
2
run/env/browserless.env
vendored
Normal file
2
run/env/browserless.env
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
DEFAULT_LAUNCH_ARGS='["--window-size=1024,768", "--user-agent=search.marginalia.nu", "--force-device-scale-factor=0.5", "--high-dpi-support=0.5"]'
|
||||||
|
DEBUG=-*
|
40
run/experiment-elsewhere.sh
Executable file
40
run/experiment-elsewhere.sh
Executable file
@ -0,0 +1,40 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
EXPERIMENT=$1
|
||||||
|
SAMPLE_NAME=crawl-${2:-m}
|
||||||
|
ARGS=${@:3}
|
||||||
|
SAMPLE_DIR="node-1/samples/${SAMPLE_NAME}/"
|
||||||
|
|
||||||
|
export EXPERIMENT_RUNNER_OPTS="--enable-preview"
|
||||||
|
echo "args = $ARGS"
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
pushd $(dirname $0)
|
||||||
|
|
||||||
|
JAVA_OPTS="
|
||||||
|
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
|
||||||
|
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
|
||||||
|
-ea
|
||||||
|
"
|
||||||
|
|
||||||
|
## Configuration ends
|
||||||
|
|
||||||
|
if [ -z "$EXPERIMENT" ]; then
|
||||||
|
echo "Usage: $0 experiment-name path-to-crawl-data"
|
||||||
|
exit 255;
|
||||||
|
fi
|
||||||
|
|
||||||
|
tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/
|
||||||
|
|
||||||
|
PATH+=":install/experiment-runner/bin"
|
||||||
|
|
||||||
|
export WMSA_HOME=.
|
||||||
|
export PATH
|
||||||
|
export JAVA_OPTS
|
||||||
|
|
||||||
|
experiment-runner $2 ${EXPERIMENT} ${ARGS}
|
||||||
|
|
||||||
|
popd
|
Loading…
Reference in New Issue
Block a user