Screenshot bot

This commit is contained in:
vlofgren 2022-08-04 21:14:17 +02:00
parent 2ad6b97657
commit e22fde69ed
4 changed files with 177 additions and 5 deletions

View File

@ -28,6 +28,7 @@ repositories {
}
// Enable the ZIP64 extension so the shadow (fat) jar may exceed the classic ZIP
// format limits (more than 65535 entries, or an archive larger than 4 GB).
shadowJar {
zip64 true
}
jar {
manifest {

View File

@ -132,11 +132,16 @@ dependencies {
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
e2eTestImplementation 'org.testcontainers:nginx:1.17.3'
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
e2eTestImplementation "org.testcontainers:selenium:1.17.2"
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
e2eTestImplementation 'org.testcontainers:selenium:1.17.3'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.2.1'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.2.1'
implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
implementation 'org.sejda.imageio:webp-imageio:0.1.6'
}
configurations {
@ -144,6 +149,7 @@ configurations {
}
test {
maxParallelForks = 16
forkEvery = 1

View File

@ -0,0 +1,160 @@
package nu.marginalia.wmsa.edge.tools;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.jetbrains.annotations.NotNull;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.PageLoadStrategy;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.imageio.ImageIO;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.SQLException;
import java.time.Duration;
import java.util.*;
/**
 * Command-line tool that renders the front page of a set of domains in headless
 * Chrome, converts each screenshot to WebP and stores it in the
 * {@code DATA_DOMAIN_SCREENSHOT} table, recording the visit in
 * {@code DATA_DOMAIN_HISTORY}.
 *
 * <p>Usage: {@code ScreenshotCaptureToolMain <chromedriver-path> [domain ...]}.
 * When no domains are listed, a batch is selected from the database instead.
 */
public class ScreenshotCaptureToolMain {
    private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);

    /** Number of domains pulled from the database when none are given on the command line. */
    private static final int DEFAULT_QUEUE_SIZE = 100;

    /** Screenshots smaller than this many bytes are discarded as uninteresting. */
    private static final long MIN_SCREENSHOT_BYTES = 2500;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path to the chromedriver binary; any
     *             further arguments are domain names to capture
     */
    public static void main(String[] args) {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException,
        // and before any database pool or browser process has been started.
        if (args.length < 1) {
            logger.error("Usage: ScreenshotCaptureToolMain <chromedriver-path> [domain ...]");
            System.exit(1);
        }

        DatabaseModule databaseModule = new DatabaseModule();

        // The pool is closed when the tool finishes so no connections linger.
        try (var ds = databaseModule.provideConnection()) {
            ChromeDriver driver = initChromeDriver(args);
            try {
                // Built inside the try so the browser is shut down even if this fails.
                List<EdgeDomain> crawlQueue = getDomains(args, ds);

                try (Connection conn = ds.getConnection()) {
                    for (var domain : crawlQueue) {
                        logger.info("Fetching {}", domain);

                        // A domain is flagged as fetched whether or not a usable
                        // screenshot was produced, so it moves to the back of the queue.
                        fetchDomain(driver, domain)
                                .ifPresentOrElse(
                                        (path) -> uploadScreenshot(conn, domain, path),
                                        () -> flagDomainAsFetched(conn, domain));
                    }
                }
            } catch (SQLException e) {
                logger.error("Database error while capturing screenshots", e);
            } finally {
                driver.quit();
            }
        }
    }

    /**
     * Determines which domains to capture.
     *
     * @param args command-line arguments; entries after the first are domain names
     * @param ds   data source used to select a batch when no domains are listed
     * @return the domains named on the command line, or a database-selected batch
     */
    @NotNull
    private static List<EdgeDomain> getDomains(String[] args, HikariDataSource ds) {
        if (args.length <= 1) {
            return fetchCrawlQueue(ds, DEFAULT_QUEUE_SIZE);
        }

        // args[0] is the chromedriver path, not a domain.
        return Arrays.stream(args).skip(1).map(EdgeDomain::new).toList();
    }

    /**
     * Starts a headless Chrome session configured for taking screenshots.
     *
     * @param args command-line arguments; {@code args[0]} is the chromedriver path
     * @return a running driver that the caller must {@code quit()}
     */
    @NotNull
    private static ChromeDriver initChromeDriver(String[] args) {
        System.setProperty("webdriver.chrome.driver", args[0]);

        ChromeOptions options = new ChromeOptions();
        options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
        options.setPageLoadTimeout(Duration.ofSeconds(30));
        options.addArguments(
                "no-sandbox",
                "headless",
                "user-agent=search.marginalia.nu",
                "window-size=1024,768",
                "force-device-scale-factor=0.5",
                "high-dpi-support=0.5",
                "disable-gpu",
                "disable-dev-shm-usage",
                "disable-software-rasterizer");

        return new ChromeDriver(options);
    }

    /**
     * Records that a screenshot attempt was made for the domain right now.
     *
     * @throws RuntimeException wrapping the {@link SQLException} if the update fails
     */
    private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
        try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) {
            stmt.setString(1, domain.toString());
            stmt.executeUpdate();
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Stores the screenshot file for the domain, removes the temporary file and
     * flags the domain as fetched. Upload failures are logged, not propagated.
     *
     * @param conn           open database connection
     * @param domain         domain the screenshot belongs to
     * @param screenshotPath temporary WebP file; always deleted by this method
     */
    private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) {
        logger.info("Uploading {}", screenshotPath);

        try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)");
             var is = Files.newInputStream(screenshotPath)
        ) {
            stmt.setString(1, domain.toString());
            stmt.setString(2, "image/webp");
            stmt.setBlob(3, is);
            stmt.executeUpdate();
        } catch (SQLException | IOException e) {
            logger.error("Failed to upload screenshot for {}", domain, e);
        } finally {
            // Runs after the stream is closed, and also when the upload failed,
            // so temporary files never accumulate.
            deleteTempFile(screenshotPath);
        }

        flagDomainAsFetched(conn, domain);
    }

    /**
     * Loads the domain's root URL and writes a WebP screenshot to a temporary file.
     *
     * @return the path of the screenshot, or empty if the page could not be
     *         captured or the result was too small to be worth keeping
     */
    private static Optional<Path> fetchDomain(ChromeDriver driver, EdgeDomain domain) {
        Path destPath = null;
        try {
            driver.get(domain.toRootUrl().toString());

            final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES);
            final var img = ImageIO.read(new ByteArrayInputStream(bytes));
            if (img == null) {
                // ImageIO.read returns null when no reader understands the data.
                logger.warn("Could not decode screenshot for {}", domain);
                return Optional.empty();
            }

            destPath = Files.createTempFile("website-screenshot-", ".webp");
            if (!ImageIO.write(img, "webp", destPath.toFile())) {
                // ImageIO.write returns false when no writer for the format is registered.
                logger.warn("No webp image writer available, cannot store screenshot for {}", domain);
                deleteTempFile(destPath);
                return Optional.empty();
            }

            // If the screenshot is very small by size, it's very likely not particularly interesting to look at
            if (Files.size(destPath) < MIN_SCREENSHOT_BYTES) {
                deleteTempFile(destPath);
                return Optional.empty();
            }

            return Optional.of(destPath);
        }
        catch (Exception ex) {
            logger.warn("Failed to capture screenshot for {}", domain, ex);
            if (destPath != null) {
                deleteTempFile(destPath);
            }
            return Optional.empty();
        }
    }

    /** Removes a temporary file, logging instead of failing if that is not possible. */
    private static void deleteTempFile(Path path) {
        try {
            Files.deleteIfExists(path);
        } catch (IOException e) {
            logger.warn("Could not delete temporary file {}", path, e);
        }
    }

    /**
     * Selects the next batch of domains to capture: domains without a recorded
     * screenshot date first, then the oldest screenshot dates.
     *
     * @param ds        data source to query
     * @param queueSize maximum number of domains to return
     * @return the selected domains, or an empty list if the query fails
     */
    private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
        List<EdgeDomain> ret = new ArrayList<>(queueSize);

        // queueSize is an int, so appending it to the statement cannot inject SQL.
        final String query = """
                SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
                LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
                ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
                LIMIT
                """ + queueSize;

        try (var conn = ds.getConnection();
             var stmt = conn.createStatement();
             var rsp = stmt.executeQuery(query)) {
            while (rsp.next()) {
                ret.add(new EdgeDomain(rsp.getString(1)));
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch the screenshot crawl queue", ex);
            return Collections.emptyList();
        }

        return ret;
    }
}

View File

@ -220,4 +220,9 @@ CREATE TABLE IF NOT EXISTS DATA_DOMAIN_SCREENSHOT (
)
ROW_FORMAT=DYNAMIC
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
COLLATE utf8mb4_unicode_ci;
-- Per-domain bookkeeping of when a screenshot was last attempted.
-- IF NOT EXISTS keeps the script re-runnable, matching the other tables in this file.
CREATE TABLE IF NOT EXISTS DATA_DOMAIN_HISTORY (
    DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
    SCREENSHOT_DATE DATE DEFAULT NOW()
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;