(screenshot-capture-tool) Make screenshot-capture-tool cooperate with docker

This commit is contained in:
Viktor Lofgren 2023-11-01 16:38:55 +01:00
parent a8b9d21f2d
commit 3047e2dd7c
6 changed files with 160 additions and 122 deletions

View File

@ -8,7 +8,6 @@ import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.model.*;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;

View File

@ -3,6 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.palantir.docker' version '0.35.0'
}
java {
@ -16,6 +17,8 @@ application {
applicationName = 'screenshot-capture-tool'
}
apply from: "$rootProject.projectDir/docker-service.gradle"
tasks.distZip.enabled = false
dependencies {
@ -30,6 +33,7 @@ dependencies {
implementation libs.commons.compress
implementation libs.commons.io
implementation libs.guice
implementation libs.gson
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@ -1,37 +1,28 @@
package nu.marginalia.screenshot;
import com.google.gson.Gson;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.service.module.DatabaseModule;
import org.jetbrains.annotations.NotNull;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.PageLoadStrategy;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.imageio.ImageIO;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.SQLException;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.stream.Collectors;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.openqa.selenium.JavascriptExecutor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
public class ScreenshotCaptureToolMain {
@ -43,30 +34,21 @@ public class ScreenshotCaptureToolMain {
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
ChromeDriver driver = initChromeDriver();
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
HttpClient httpClient = HttpClient.newHttpClient();
HttpClient httpClient = HttpClient.newBuilder()
.version(HttpClient.Version.HTTP_1_1)
.connectTimeout(Duration.ofSeconds(30))
.build()
;
try (Connection conn = ds.getConnection()) {
logger.info("Probing domains");
var ret = crawlQueue.parallelStream().collect(Collectors.partitioningBy(domain -> probeUrl(httpClient, domain)));
var badDomains = ret.getOrDefault(Boolean.FALSE, Collections.emptyList());
var goodDomains = ret.getOrDefault(Boolean.TRUE, Collections.emptyList());
logger.info("Result: {} good domains, {} bad domains", goodDomains.size(), badDomains.size());
badDomains.forEach(domain -> flagDomainAsFetched(conn, domain));
for (var domain : goodDomains) {
for (var domain : crawlQueue) {
logger.info("Fetching {}", domain);
var filePath = fetchDomain(driver, domain);
if (filePath != null) {
uploadScreenshot(conn, domain, filePath);
byte[] webpBytes = fetchDomain(httpClient, domain);
if (webpBytes != null) {
uploadScreenshot(conn, domain, webpBytes);
} else {
flagDomainAsFetched(conn, domain);
}
@ -74,38 +56,15 @@ public class ScreenshotCaptureToolMain {
} catch (SQLException e) {
e.printStackTrace();
} finally {
driver.quit();
}
}
@NotNull
private static ChromeDriver initChromeDriver() {
System.setProperty("webdriver.chrome.driver", "./chromedriver");
ChromeOptions options = new ChromeOptions();
options.setPageLoadStrategy(PageLoadStrategy.NONE);
options.setPageLoadTimeout(Duration.ofSeconds(30));
options.addArguments(
"no-sandbox",
"headless",
"user-agent=search.marginalia.nu",
"window-size=1024,768",
"force-device-scale-factor=0.5",
"high-dpi-support=0.5",
"dns-prefetch-disable",
"disable-gpu",
"disable-dev-shm-usage",
"disable-software-rasterizer",
"disable-extensions"
);
return new ChromeDriver(options);
}
private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) {
try (var stmt = conn.prepareStatement("""
REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE)
VALUES (?, NOW())
"""))
{
stmt.setString(1, domain.toString());
stmt.executeUpdate();
} catch (SQLException e) {
@ -113,87 +72,68 @@ public class ScreenshotCaptureToolMain {
}
}
private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) {
logger.info("Uploading {}", screenshotPath);
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)");
var is = Files.newInputStream(screenshotPath)
private static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] webpBytes) {
try (var stmt = conn.prepareStatement("""
REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA)
VALUES (?,?,?)
""");
var is = new ByteArrayInputStream(webpBytes)
) {
stmt.setString(1, domain.toString());
stmt.setString(2, "image/webp");
stmt.setString(2, "image/png");
stmt.setBlob(3, is);
stmt.executeUpdate();
Files.delete(screenshotPath);
} catch (SQLException | IOException e) {
e.printStackTrace();
}
flagDomainAsFetched(conn, domain);
}
private static boolean probeUrl(HttpClient httpClient, EdgeDomain domain) {
private static Gson gson = GsonFactory.get();
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
try {
Map<String, Object> requestData = Map.of(
"url", domain.toRootUrl().toString(),
"options",
Map.of("fullPage", false,
"type", "png"),
"gotoOptions", Map.of(
"waitUntil", "networkidle2",
"timeout", TimeUnit.SECONDS.toMillis(10)
)
);
var request = HttpRequest.newBuilder()
.uri(new URI(domain.toRootUrl().toString()))
.timeout(Duration.ofSeconds(5))
.method("HEAD", HttpRequest.BodyPublishers.noBody())
.header("user-agent", "search.marginialia.nu")
.uri(new URI("http://browserless:3000/screenshot"))
.method("POST", HttpRequest.BodyPublishers.ofString(
gson.toJson(requestData)
))
.header("Content-type", "application/json")
.build();
var rsp = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
return response.statusCode() < 400;
} catch (Exception ex) {
return false;
}
}
private static Path fetchDomain(ChromeDriver driver, EdgeDomain domain) {
try {
driver.get(domain.toRootUrl().toString());
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
try {
wait.until((ExpectedCondition<Boolean>) wd -> {
if (wd instanceof JavascriptExecutor jse) {
return "complete".equals(jse.executeScript("return document.readyState"));
}
return true;
});
}
catch (TimeoutException ex) {
logger.info("Wait timed out, forcing window.stop()");
driver.executeScript("window.stop()");
}
final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES);
final var img = ImageIO.read(new ByteArrayInputStream(bytes));
Path destPath = Files.createTempFile("website-screenshot-", ".webp");
ImageIO.write(img, "webp", destPath.toFile());
// If the screenshot is very small by size, it's very likely not particularly interesting to look at
if (Files.size(destPath) < 3500) {
Files.delete(destPath);
if (rsp.statusCode() >= 300) {
return null;
}
return destPath;
byte[] image = rsp.body();
if (image.length < 3500) {
logger.warn("Skipping {} due to size ({})", domain, image.length);
return null;
}
return image;
}
catch (Exception ex) {
ex.printStackTrace();
logger.warn("Exception in screenshotting " + domain, ex);
return null;
}
}
private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
List<EdgeDomain> ret = new ArrayList<>(queueSize);
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
@ -202,14 +142,14 @@ public class ScreenshotCaptureToolMain {
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
LIMIT
LIMIT
""" + queueSize);
while (rsp.next()) {
ret.add(new EdgeDomain(rsp.getString(1)));
}
}
catch (SQLException ex) {
ex.printStackTrace();
catch (Exception ex) {
logger.warn("Exception in fetching queue", ex);
return Collections.emptyList();
}
return ret;

View File

@ -0,0 +1,53 @@
# Docker Compose definition that runs the screenshot capture tool alongside a
# browserless/chrome container.
#
# "x-svc" is an extension field holding settings shared by the services below;
# each service pulls them in with the merge key "<<: *service".
x-svc: &service
env_file:
- "run/env/service.env"
volumes:
# Configuration is mounted read-only; data and logs are writable.
- conf:/wmsa/conf:ro
- data:/wmsa/data
- logs:/var/log/wmsa
networks:
- wmsa
services:
screenshot-capture-tool:
<<: *service
image: "marginalia.nu/screenshot-capture-tool"
container_name: "screenshot-capture-tool"
# Listed explicitly so the tool joins the headlesschrome network in addition to wmsa.
networks:
- wmsa
- headlesschrome
depends_on:
- browserless
# NOTE(review): the service key is "browserless" while container_name is
# "headlesschrome". Other code in this change appears to address the service as
# "browserless" on port 3000 — confirm before renaming either one.
browserless:
<<: *service
image: "browserless/chrome"
container_name: "headlesschrome"
# An explicit env_file key takes the place of the one merged in from the shared anchor.
env_file:
- "run/env/browserless.env"
# Publishes container port 3000 on host port 3000.
ports:
- "3000:3000"
networks:
- wmsa
- headlesschrome
networks:
wmsa:
headlesschrome:
# Named volumes backed by bind mounts of directories under run/.
volumes:
logs:
driver: local
driver_opts:
type: none
o: bind
device: run/logs
conf:
driver: local
driver_opts:
type: none
o: bind
device: run/conf
data:
driver: local
driver_opts:
type: none
o: bind
device: run/data

2
run/env/browserless.env vendored Normal file
View File

@ -0,0 +1,2 @@
# Environment settings for the browserless/chrome container.
#
# JSON array of Chrome-style command-line flags: a 1024x768 window, a custom
# user agent string, and a 0.5 device scale factor / high-DPI setting.
DEFAULT_LAUNCH_ARGS='["--window-size=1024,768", "--user-agent=search.marginalia.nu", "--force-device-scale-factor=0.5", "--high-dpi-support=0.5"]'
# NOTE(review): "-*" looks like a debug-namespace filter that excludes every
# namespace, i.e. debug logging switched off — confirm against the browserless docs.
DEBUG=-*

40
run/experiment-elsewhere.sh Executable file
View File

@ -0,0 +1,40 @@
#!/bin/bash
#
# Unpacks the experiment-runner distribution built under ../code/tools and runs
# one experiment with it, pointing the JVM at a MariaDB instance on localhost.
#
# Usage: experiment-elsewhere.sh experiment-name path-to-crawl-data [args...]
#
#   $1   experiment name (required)
#   $2   forwarded to experiment-runner as its first argument; also used to
#        build the sample directory name "crawl-<$2>" (suffix defaults to "m")
#   $3+  forwarded to experiment-runner after the experiment name

set -e

EXPERIMENT=$1

# Validate the arguments before changing directory or unpacking anything.
if [ -z "$EXPERIMENT" ]; then
  echo "Usage: $0 experiment-name path-to-crawl-data" >&2
  exit 255
fi

SAMPLE_NAME=crawl-${2:-m}
# Keep the extra arguments in an array so that arguments containing whitespace
# or glob characters reach experiment-runner unchanged.
ARGS=("${@:3}")
SAMPLE_DIR="node-1/samples/${SAMPLE_NAME}/"

export EXPERIMENT_RUNNER_OPTS="--enable-preview"

echo "args = ${ARGS[*]}"

## Configuration

# Every relative path below is resolved from the directory containing this
# script; the quoting keeps this working when that path contains spaces.
pushd "$(dirname "$0")"

JAVA_OPTS="
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
-ea
"

## Configuration ends

tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/

PATH+=":install/experiment-runner/bin"

export WMSA_HOME=.
export PATH
export JAVA_OPTS

# ${2:+"$2"} expands to nothing when $2 is unset or empty (matching what the
# unquoted $2 did) and to a single word otherwise.
experiment-runner ${2:+"$2"} "${EXPERIMENT}" "${ARGS[@]}"

popd