From 3047e2dd7c590ca026c7aaf36ef5baac9aee130e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 1 Nov 2023 16:38:55 +0100 Subject: [PATCH] (screenshot-capture-tool) Make screenshot-capture-tool cooperate with docker --- .../actor/task/ExportAtagsActor.java | 1 - .../screenshot-capture-tool/build.gradle | 4 + .../screenshot/ScreenshotCaptureToolMain.java | 182 ++++++------------ docker-compose-screenshot-bot.yml | 53 +++++ run/env/browserless.env | 2 + run/experiment-elsewhere.sh | 40 ++++ 6 files changed, 160 insertions(+), 122 deletions(-) create mode 100644 docker-compose-screenshot-bot.yml create mode 100644 run/env/browserless.env create mode 100755 run/experiment-elsewhere.sh diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 68a05132..0af77acb 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -8,7 +8,6 @@ import lombok.SneakyThrows; import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.storage.model.*; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; diff --git a/code/tools/screenshot-capture-tool/build.gradle b/code/tools/screenshot-capture-tool/build.gradle index ee231abb..6c1f7c67 100644 --- a/code/tools/screenshot-capture-tool/build.gradle +++ b/code/tools/screenshot-capture-tool/build.gradle @@ -3,6 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' + id 'com.palantir.docker' version '0.35.0' } java { @@ -16,6 +17,8 @@ application { applicationName = 'screenshot-capture-tool' } +apply from: "$rootProject.projectDir/docker-service.gradle" + tasks.distZip.enabled = false dependencies { @@ -30,6 +33,7 @@ dependencies { implementation libs.commons.compress implementation libs.commons.io implementation libs.guice + implementation libs.gson testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java b/code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java index 2bc7921c..eba90095 100644 --- a/code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java +++ b/code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java @@ -1,37 +1,28 @@ package nu.marginalia.screenshot; +import com.google.gson.Gson; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.module.DatabaseModule; -import org.jetbrains.annotations.NotNull; -import org.openqa.selenium.OutputType; -import org.openqa.selenium.PageLoadStrategy; -import org.openqa.selenium.TimeoutException; -import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeDriverService; -import org.openqa.selenium.chrome.ChromeOptions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.imageio.ImageIO; import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.URI; import java.net.http.HttpClient; import java.net.http.HttpRequest; import java.net.http.HttpResponse; -import java.nio.file.Files; -import java.nio.file.Path; import java.sql.Connection; import java.sql.SQLException; import java.time.Duration; -import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.stream.Collectors; - -import org.openqa.selenium.support.ui.ExpectedCondition; -import org.openqa.selenium.support.ui.WebDriverWait; -import org.openqa.selenium.JavascriptExecutor; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; public class ScreenshotCaptureToolMain { @@ -43,30 +34,21 @@ public class ScreenshotCaptureToolMain { System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true"); - ChromeDriver driver = initChromeDriver(); List crawlQueue = fetchCrawlQueue(ds, 1000); - HttpClient httpClient = HttpClient.newHttpClient(); + HttpClient httpClient = HttpClient.newBuilder() + .version(HttpClient.Version.HTTP_1_1) + .connectTimeout(Duration.ofSeconds(30)) + .build() + ; try (Connection conn = ds.getConnection()) { - - - logger.info("Probing domains"); - var ret = crawlQueue.parallelStream().collect(Collectors.partitioningBy(domain -> probeUrl(httpClient, domain))); - - var badDomains = ret.getOrDefault(Boolean.FALSE, Collections.emptyList()); - var goodDomains = ret.getOrDefault(Boolean.TRUE, Collections.emptyList()); - - logger.info("Result: {} good domains, {} bad domains", goodDomains.size(), badDomains.size()); - - badDomains.forEach(domain -> flagDomainAsFetched(conn, domain)); - - for (var domain : goodDomains) { + for (var domain : crawlQueue) { logger.info("Fetching {}", domain); - var filePath = fetchDomain(driver, domain); - if (filePath != null) { - uploadScreenshot(conn, domain, filePath); + byte[] webpBytes = fetchDomain(httpClient, domain); + if (webpBytes != null) { + uploadScreenshot(conn, domain, webpBytes); } else { flagDomainAsFetched(conn, domain); } @@ -74,38 +56,15 @@ public class ScreenshotCaptureToolMain { } catch (SQLException e) { e.printStackTrace(); - } finally { - driver.quit(); } } - @NotNull - private static ChromeDriver initChromeDriver() { - System.setProperty("webdriver.chrome.driver", "./chromedriver"); - ChromeOptions options = new ChromeOptions(); - - options.setPageLoadStrategy(PageLoadStrategy.NONE); - options.setPageLoadTimeout(Duration.ofSeconds(30)); - - options.addArguments( - "no-sandbox", - "headless", - "user-agent=search.marginalia.nu", - "window-size=1024,768", - "force-device-scale-factor=0.5", - "high-dpi-support=0.5", - "dns-prefetch-disable", - "disable-gpu", - "disable-dev-shm-usage", - "disable-software-rasterizer", - "disable-extensions" - ); - - return new ChromeDriver(options); - } - private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) { - try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) { + try (var stmt = conn.prepareStatement(""" + REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) + VALUES (?, NOW()) + """)) + { stmt.setString(1, domain.toString()); stmt.executeUpdate(); } catch (SQLException e) { @@ -113,87 +72,68 @@ public class ScreenshotCaptureToolMain { } } - private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) { - logger.info("Uploading {}", screenshotPath); - try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)"); - var is = Files.newInputStream(screenshotPath) + private static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] webpBytes) { + try (var stmt = conn.prepareStatement(""" + REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) + VALUES (?,?,?) + """); + var is = new ByteArrayInputStream(webpBytes) ) { stmt.setString(1, domain.toString()); - stmt.setString(2, "image/webp"); + stmt.setString(2, "image/png"); stmt.setBlob(3, is); stmt.executeUpdate(); - - Files.delete(screenshotPath); - } catch (SQLException | IOException e) { e.printStackTrace(); } - flagDomainAsFetched(conn, domain); } - private static boolean probeUrl(HttpClient httpClient, EdgeDomain domain) { + private static Gson gson = GsonFactory.get(); + + private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) { try { + Map requestData = Map.of( + "url", domain.toRootUrl().toString(), + "options", + Map.of("fullPage", false, + "type", "png"), + "gotoOptions", Map.of( + "waitUntil", "networkidle2", + "timeout", TimeUnit.SECONDS.toMillis(10) + ) + ); + var request = HttpRequest.newBuilder() - .uri(new URI(domain.toRootUrl().toString())) - .timeout(Duration.ofSeconds(5)) - .method("HEAD", HttpRequest.BodyPublishers.noBody()) - .header("user-agent", "search.marginialia.nu") + .uri(new URI("http://browserless:3000/screenshot")) + .method("POST", HttpRequest.BodyPublishers.ofString( + gson.toJson(requestData) + )) + .header("Content-type", "application/json") .build(); + var rsp = client.send(request, HttpResponse.BodyHandlers.ofByteArray()); - var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString()); - - return response.statusCode() < 400; - } catch (Exception ex) { - return false; - } - - } - private static Path fetchDomain(ChromeDriver driver, EdgeDomain domain) { - try { - driver.get(domain.toRootUrl().toString()); - - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30)); - - try { - wait.until((ExpectedCondition) wd -> { - if (wd instanceof JavascriptExecutor jse) { - return "complete".equals(jse.executeScript("return document.readyState")); - } - return true; - }); - } - catch (TimeoutException ex) { - logger.info("Wait timed out, forcing window.stop()"); - driver.executeScript("window.stop()"); - } - - - - final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES); - - final var img = ImageIO.read(new ByteArrayInputStream(bytes)); - - - Path destPath = Files.createTempFile("website-screenshot-", ".webp"); - ImageIO.write(img, "webp", destPath.toFile()); - - // If the screenshot is very small by size, it's very likely not particularly interesting to look at - if (Files.size(destPath) < 3500) { - Files.delete(destPath); + if (rsp.statusCode() >= 300) { return null; } - return destPath; + byte[] image = rsp.body(); + if (image.length < 3500) { + logger.warn("Skipping {} due to size ({})", domain, image.length); + return null; + } + + return image; } catch (Exception ex) { - ex.printStackTrace(); + logger.warn("Exception in screenshotting " + domain, ex); return null; } } private static List fetchCrawlQueue(HikariDataSource ds, int queueSize) { + List ret = new ArrayList<>(queueSize); try (var conn = ds.getConnection(); var stmt = conn.createStatement()) { @@ -202,14 +142,14 @@ public class ScreenshotCaptureToolMain { SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC - LIMIT + LIMIT """ + queueSize); while (rsp.next()) { ret.add(new EdgeDomain(rsp.getString(1))); } } - catch (SQLException ex) { - ex.printStackTrace(); + catch (Exception ex) { + logger.warn("Exception in fetching queue", ex); return Collections.emptyList(); } return ret; diff --git a/docker-compose-screenshot-bot.yml b/docker-compose-screenshot-bot.yml new file mode 100644 index 00000000..d442fccf --- /dev/null +++ b/docker-compose-screenshot-bot.yml @@ -0,0 +1,53 @@ +x-svc: &service + env_file: + - "run/env/service.env" + volumes: + - conf:/wmsa/conf:ro + - data:/wmsa/data + - logs:/var/log/wmsa + networks: + - wmsa +services: + screenshot-capture-tool: + <<: *service + image: "marginalia.nu/screenshot-capture-tool" + container_name: "screenshot-capture-tool" + networks: + - wmsa + - headlesschrome + depends_on: + - browserless + browserless: + <<: *service + image: "browserless/chrome" + container_name: "headlesschrome" + env_file: + - "run/env/browserless.env" + ports: + - "3000:3000" + networks: + - wmsa + - headlesschrome + +networks: + wmsa: + headlesschrome: +volumes: + logs: + driver: local + driver_opts: + type: none + o: bind + device: run/logs + conf: + driver: local + driver_opts: + type: none + o: bind + device: run/conf + data: + driver: local + driver_opts: + type: none + o: bind + device: run/data \ No newline at end of file diff --git a/run/env/browserless.env b/run/env/browserless.env new file mode 100644 index 00000000..6bd12c40 --- /dev/null +++ b/run/env/browserless.env @@ -0,0 +1,2 @@ +DEFAULT_LAUNCH_ARGS='["--window-size=1024,768", "--user-agent=search.marginalia.nu", "--force-device-scale-factor=0.5", "--high-dpi-support=0.5"]' +DEBUG=-* \ No newline at end of file diff --git a/run/experiment-elsewhere.sh b/run/experiment-elsewhere.sh new file mode 100755 index 00000000..b3a020d5 --- /dev/null +++ b/run/experiment-elsewhere.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -e + +EXPERIMENT=$1 +SAMPLE_NAME=crawl-${2:-m} +ARGS=${@:3} +SAMPLE_DIR="node-1/samples/${SAMPLE_NAME}/" + +export EXPERIMENT_RUNNER_OPTS="--enable-preview" +echo "args = $ARGS" + +## Configuration + +pushd $(dirname $0) + +JAVA_OPTS=" +-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR} +-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true +-ea +" + +## Configuration ends + +if [ -z "$EXPERIMENT" ]; then + echo "Usage: $0 experiment-name path-to-crawl-data" + exit 255; +fi + +tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/ + +PATH+=":install/experiment-runner/bin" + +export WMSA_HOME=. +export PATH +export JAVA_OPTS + +experiment-runner $2 ${EXPERIMENT} ${ARGS} + +popd