Screenshot bot
This commit is contained in:
parent
2ad6b97657
commit
e22fde69ed
@ -28,6 +28,7 @@ repositories {
|
||||
}
|
||||
|
||||
// Configure the shadow (fat) jar task: turn on the zip64 option for the produced archive.
// NOTE(review): presumably required because the bundled dependencies exceed the classic
// ZIP entry limit -- confirm against the Shadow plugin documentation.
shadowJar {
|
||||
zip64 true
|
||||
}
|
||||
jar {
|
||||
manifest {
|
||||
|
@ -132,11 +132,16 @@ dependencies {
|
||||
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
|
||||
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
|
||||
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
|
||||
e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
|
||||
e2eTestImplementation 'org.testcontainers:nginx:1.17.3'
|
||||
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
|
||||
e2eTestImplementation "org.testcontainers:selenium:1.17.2"
|
||||
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
|
||||
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
|
||||
e2eTestImplementation 'org.testcontainers:selenium:1.17.3'
|
||||
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.2.1'
|
||||
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.2.1'
|
||||
|
||||
|
||||
implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
|
||||
implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
|
||||
implementation 'org.sejda.imageio:webp-imageio:0.1.6'
|
||||
}
|
||||
|
||||
configurations {
|
||||
@ -144,6 +149,7 @@ configurations {
|
||||
|
||||
}
|
||||
|
||||
|
||||
test {
|
||||
maxParallelForks = 16
|
||||
forkEvery = 1
|
||||
|
@ -0,0 +1,160 @@
|
||||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.openqa.selenium.OutputType;
|
||||
import org.openqa.selenium.PageLoadStrategy;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
|
||||
public class ScreenshotCaptureToolMain {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
|
||||
|
||||
public static void main(String[] args) {
|
||||
DatabaseModule databaseModule = new DatabaseModule();
|
||||
var ds = databaseModule.provideConnection();
|
||||
|
||||
ChromeDriver driver = initChromeDriver(args);
|
||||
List<EdgeDomain> crawlQueue = getDomains(args, ds);
|
||||
|
||||
try (Connection conn = ds.getConnection()) {
|
||||
for (var domain : crawlQueue) {
|
||||
logger.info("Fetching {}", domain);
|
||||
|
||||
fetchDomain(driver, domain)
|
||||
.ifPresentOrElse(
|
||||
(path) -> uploadScreenshot(conn, domain, path),
|
||||
() -> flagDomainAsFetched(conn, domain));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
driver.quit();
|
||||
}
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static List<EdgeDomain> getDomains(String[] args, HikariDataSource ds) {
|
||||
List<EdgeDomain> crawlQueue;
|
||||
if (args.length <= 1) {
|
||||
crawlQueue = fetchCrawlQueue(ds, 100);
|
||||
}
|
||||
else {
|
||||
crawlQueue = Arrays.stream(args).skip(1).map(EdgeDomain::new).toList();
|
||||
}
|
||||
return crawlQueue;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static ChromeDriver initChromeDriver(String[] args) {
|
||||
System.setProperty("webdriver.chrome.driver", args[0]);
|
||||
ChromeOptions options = new ChromeOptions();
|
||||
|
||||
options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
|
||||
options.setPageLoadTimeout(Duration.ofSeconds(30));
|
||||
|
||||
options.addArguments(
|
||||
"no-sandbox",
|
||||
"headless",
|
||||
"user-agent=search.marginalia.nu",
|
||||
"window-size=1024,768",
|
||||
"force-device-scale-factor=0.5",
|
||||
"high-dpi-support=0.5",
|
||||
"disable-gpu",
|
||||
"disable-dev-shm-usage",
|
||||
"disable-software-rasterizer");
|
||||
|
||||
return new ChromeDriver(options);
|
||||
}
|
||||
|
||||
private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
|
||||
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.executeUpdate();
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) {
|
||||
logger.info("Uploading {}", screenshotPath);
|
||||
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)");
|
||||
var is = Files.newInputStream(screenshotPath)
|
||||
) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.setString(2, "image/webp");
|
||||
stmt.setBlob(3, is);
|
||||
stmt.executeUpdate();
|
||||
|
||||
Files.delete(screenshotPath);
|
||||
|
||||
} catch (SQLException | IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
flagDomainAsFetched(conn, domain);
|
||||
}
|
||||
|
||||
private static Optional<Path> fetchDomain(ChromeDriver driver, EdgeDomain domain) {
|
||||
try {
|
||||
driver.get(domain.toRootUrl().toString());
|
||||
|
||||
final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES);
|
||||
|
||||
final var img = ImageIO.read(new ByteArrayInputStream(bytes));
|
||||
|
||||
Path destPath = Files.createTempFile("website-screenshot-", ".webp");
|
||||
ImageIO.write(img, "webp", destPath.toFile());
|
||||
|
||||
// If the screenshot is very small by size, it's very likely not particularly interesting to look at
|
||||
if (Files.size(destPath) < 2500) {
|
||||
Files.delete(destPath);
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(destPath);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
|
||||
List<EdgeDomain> ret = new ArrayList<>(queueSize);
|
||||
|
||||
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
||||
var rsp = stmt.executeQuery(
|
||||
"""
|
||||
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
|
||||
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
|
||||
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
|
||||
LIMIT
|
||||
""" + queueSize);
|
||||
while (rsp.next()) {
|
||||
ret.add(new EdgeDomain(rsp.getString(1)));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
@ -220,4 +220,9 @@ CREATE TABLE IF NOT EXISTS DATA_DOMAIN_SCREENSHOT (
|
||||
)
|
||||
ROW_FORMAT=DYNAMIC
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
-- Per-domain record of when a screenshot was last attempted.
-- IF NOT EXISTS keeps the script re-runnable, matching the DATA_DOMAIN_SCREENSHOT definition.
CREATE TABLE IF NOT EXISTS DATA_DOMAIN_HISTORY (
    DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
    SCREENSHOT_DATE DATE DEFAULT NOW()
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
Loading…
Reference in New Issue
Block a user