From e22fde69ed6217bef4c80c4866e764b9d5b3a8ce Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Thu, 4 Aug 2022 21:14:17 +0200
Subject: [PATCH] Screenshot bot

---
 build.gradle                                  |   1 +
 marginalia_nu/build.gradle                    |  14 +-
 .../edge/tools/ScreenshotCaptureToolMain.java | 239 ++++++++++++++++++
 .../main/resources/sql/edge-crawler-cache.sql |   7 +-
 4 files changed, 256 insertions(+), 5 deletions(-)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ScreenshotCaptureToolMain.java

diff --git a/build.gradle b/build.gradle
index cb2fef4b..c13b58da 100644
--- a/build.gradle
+++ b/build.gradle
@@ -28,6 +28,7 @@ repositories {
 }
 
 shadowJar {
+    zip64 true
 }
 jar {
     manifest {
diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle
index eb553649..94e240fe 100644
--- a/marginalia_nu/build.gradle
+++ b/marginalia_nu/build.gradle
@@ -132,11 +132,16 @@ dependencies {
     e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
     e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
     e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
-    e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
+    e2eTestImplementation 'org.testcontainers:nginx:1.17.3'
     e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
-    e2eTestImplementation "org.testcontainers:selenium:1.17.2"
-    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
-    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
+    e2eTestImplementation 'org.testcontainers:selenium:1.17.3'
+    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.2.1'
+    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.2.1'
+
+
+    implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
+    implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
+    implementation 'org.sejda.imageio:webp-imageio:0.1.6'
 }
 
 configurations {
@@ -144,6 +149,7 @@ configurations {
 }
 
 
+
 test {
     maxParallelForks = 16
     forkEvery = 1
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ScreenshotCaptureToolMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ScreenshotCaptureToolMain.java
new file mode 100644
index 00000000..8ddd1b9d
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ScreenshotCaptureToolMain.java
@@ -0,0 +1,239 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import org.jetbrains.annotations.NotNull;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.PageLoadStrategy;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.imageio.ImageIO;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.time.Duration;
+import java.util.*;
+
+/**
+ * Command-line tool that captures website screenshots with headless Chrome and stores them
+ * in the database as WebP images.
+ * <p>
+ * Usage: {@code ScreenshotCaptureToolMain chromedriver-path [domain ...]}. When no domains are
+ * given, a batch is selected from the database.
+ */
+public class ScreenshotCaptureToolMain {
+
+    private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
+
+    /** Number of domains pulled from the database when none are listed on the command line. */
+    private static final int DEFAULT_QUEUE_SIZE = 100;
+
+    /** Screenshot files smaller than this many bytes are discarded as uninteresting. */
+    private static final long MIN_SCREENSHOT_BYTES = 2500;
+
+    /**
+     * Captures a screenshot of every queued domain and uploads it to the database.
+     *
+     * @param args {@code args[0]} is the path to the chromedriver binary; any further arguments
+     *             are domain names to capture instead of the database-selected queue
+     */
+    public static void main(String[] args) {
+        if (args.length < 1) {
+            // initChromeDriver reads args[0]; fail with a usage message instead of an index error
+            System.err.println("Usage: ScreenshotCaptureToolMain chromedriver-path [domain ...]");
+            System.exit(1);
+        }
+
+        DatabaseModule databaseModule = new DatabaseModule();
+        var ds = databaseModule.provideConnection();
+
+        ChromeDriver driver = initChromeDriver(args);
+
+        try {
+            // Resolved inside the try block so the browser is shut down even if this throws
+            List<EdgeDomain> crawlQueue = getDomains(args, ds);
+
+            try (Connection conn = ds.getConnection()) {
+                for (var domain : crawlQueue) {
+                    logger.info("Fetching {}", domain);
+
+                    fetchDomain(driver, domain)
+                            .ifPresentOrElse(
+                                    (path) -> uploadScreenshot(conn, domain, path),
+                                    () -> flagDomainAsFetched(conn, domain));
+                }
+            }
+        } catch (SQLException e) {
+            logger.error("Database error while capturing screenshots", e);
+        } finally {
+            driver.quit();
+        }
+    }
+
+    /**
+     * Resolves the domains to capture: the domains given in {@code args[1..]} when present,
+     * otherwise a batch of {@link #DEFAULT_QUEUE_SIZE} domains selected from the database.
+     */
+    @NotNull
+    private static List<EdgeDomain> getDomains(String[] args, HikariDataSource ds) {
+        if (args.length <= 1) {
+            return fetchCrawlQueue(ds, DEFAULT_QUEUE_SIZE);
+        }
+        return Arrays.stream(args).skip(1).map(EdgeDomain::new).toList();
+    }
+
+    /** Starts a headless Chrome instance driven by the chromedriver binary at {@code args[0]}. */
+    @NotNull
+    private static ChromeDriver initChromeDriver(String[] args) {
+        System.setProperty("webdriver.chrome.driver", args[0]);
+        ChromeOptions options = new ChromeOptions();
+
+        options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
+        options.setPageLoadTimeout(Duration.ofSeconds(30));
+
+        options.addArguments(
+                "no-sandbox",
+                "headless",
+                "user-agent=search.marginalia.nu",
+                "window-size=1024,768",
+                "force-device-scale-factor=0.5",
+                "high-dpi-support=0.5",
+                "disable-gpu",
+                "disable-dev-shm-usage",
+                "disable-software-rasterizer");
+
+        return new ChromeDriver(options);
+    }
+
+    /**
+     * Stamps the domain's screenshot date in DATA_DOMAIN_HISTORY with the database's current time.
+     *
+     * @throws RuntimeException wrapping the {@link SQLException} if the update fails
+     */
+    private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
+        try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) {
+            stmt.setString(1, domain.toString());
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Stores the screenshot file as the domain's image, deletes the temporary file, and flags the
+     * domain as fetched. The domain is flagged even when the upload itself fails.
+     */
+    private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) {
+        logger.info("Uploading {}", screenshotPath);
+        try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)");
+             var is = Files.newInputStream(screenshotPath)
+        ) {
+            stmt.setString(1, domain.toString());
+            stmt.setString(2, "image/webp");
+            stmt.setBlob(3, is);
+            stmt.executeUpdate();
+        } catch (SQLException | IOException e) {
+            logger.error("Failed to upload screenshot for {}", domain, e);
+        } finally {
+            // Runs once the stream is closed, and also when the upload failed, so the file never leaks
+            deleteQuietly(screenshotPath);
+        }
+
+        flagDomainAsFetched(conn, domain);
+    }
+
+    /**
+     * Loads the domain's root URL and writes a WebP screenshot of it to a temporary file.
+     *
+     * @return the path of the screenshot file, or empty if the capture failed or the resulting
+     *         file was smaller than {@link #MIN_SCREENSHOT_BYTES}
+     */
+    private static Optional<Path> fetchDomain(ChromeDriver driver, EdgeDomain domain) {
+        Path destPath = null;
+        try {
+            driver.get(domain.toRootUrl().toString());
+
+            final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES);
+
+            // ImageIO.read returns null when no registered reader can decode the data
+            final var img = ImageIO.read(new ByteArrayInputStream(bytes));
+            if (img == null) {
+                logger.warn("Could not decode screenshot of {}", domain);
+                return Optional.empty();
+            }
+
+            destPath = Files.createTempFile("website-screenshot-", ".webp");
+
+            // ImageIO.write returns false when no writer for the format is registered
+            if (!ImageIO.write(img, "webp", destPath.toFile())) {
+                logger.warn("No WebP image writer available, discarding screenshot of {}", domain);
+                deleteQuietly(destPath);
+                return Optional.empty();
+            }
+
+            // If the screenshot is very small by size, it's very likely not particularly interesting to look at
+            if (Files.size(destPath) < MIN_SCREENSHOT_BYTES) {
+                Files.delete(destPath);
+                return Optional.empty();
+            }
+
+            return Optional.of(destPath);
+        }
+        catch (Exception ex) {
+            logger.warn("Failed to capture screenshot of {}", domain, ex);
+            if (destPath != null) {
+                deleteQuietly(destPath);
+            }
+            return Optional.empty();
+        }
+    }
+
+    /** Deletes a temporary file if it exists, logging rather than propagating any failure. */
+    private static void deleteQuietly(Path path) {
+        try {
+            Files.deleteIfExists(path);
+        } catch (IOException e) {
+            logger.warn("Could not delete temporary file {}", path, e);
+        }
+    }
+
+    /**
+     * Selects up to {@code queueSize} domains to capture, preferring domains with no recorded
+     * screenshot date, then those with the oldest one.
+     *
+     * @return the selected domains, or an empty list if the query fails
+     */
+    private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
+        List<EdgeDomain> ret = new ArrayList<>(queueSize);
+
+        try (var conn = ds.getConnection();
+             var stmt = conn.prepareStatement(
+                     """
+                     SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
+                     LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
+                     ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
+                     LIMIT ?
+                     """)
+        ) {
+            stmt.setInt(1, queueSize);
+
+            try (var rsp = stmt.executeQuery()) {
+                while (rsp.next()) {
+                    ret.add(new EdgeDomain(rsp.getString(1)));
+                }
+            }
+        }
+        catch (SQLException ex) {
+            logger.error("Failed to fetch crawl queue", ex);
+            return Collections.emptyList();
+        }
+        return ret;
+    }
+}
diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
index b5dfaa17..782bc67d 100644
--- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
+++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
@@ -220,4 +220,9 @@ CREATE TABLE IF NOT EXISTS DATA_DOMAIN_SCREENSHOT (
 )
 ROW_FORMAT=DYNAMIC
 CHARACTER SET utf8mb4
-COLLATE utf8mb4_unicode_ci;
\ No newline at end of file
+COLLATE utf8mb4_unicode_ci;
+
+CREATE TABLE IF NOT EXISTS DATA_DOMAIN_HISTORY (
+    DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
+    SCREENSHOT_DATE DATE DEFAULT NOW()
+) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
\ No newline at end of file