Screenshot bot
This commit is contained in:
parent
2ad6b97657
commit
e22fde69ed
@ -28,6 +28,7 @@ repositories {
|
|||||||
}
|
}
|
||||||
|
|
||||||
shadowJar {
|
shadowJar {
|
||||||
|
zip64 true
|
||||||
}
|
}
|
||||||
jar {
|
jar {
|
||||||
manifest {
|
manifest {
|
||||||
|
@ -132,11 +132,16 @@ dependencies {
|
|||||||
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
|
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
|
||||||
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
|
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
|
||||||
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
|
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
|
||||||
e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
|
e2eTestImplementation 'org.testcontainers:nginx:1.17.3'
|
||||||
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
|
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
|
||||||
e2eTestImplementation "org.testcontainers:selenium:1.17.2"
|
e2eTestImplementation 'org.testcontainers:selenium:1.17.3'
|
||||||
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
|
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.2.1'
|
||||||
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
|
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.2.1'
|
||||||
|
|
||||||
|
|
||||||
|
implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
|
||||||
|
implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
|
||||||
|
implementation 'org.sejda.imageio:webp-imageio:0.1.6'
|
||||||
}
|
}
|
||||||
|
|
||||||
configurations {
|
configurations {
|
||||||
@ -144,6 +149,7 @@ configurations {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
test {
|
test {
|
||||||
maxParallelForks = 16
|
maxParallelForks = 16
|
||||||
forkEvery = 1
|
forkEvery = 1
|
||||||
|
@ -0,0 +1,160 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.tools;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
import org.openqa.selenium.OutputType;
|
||||||
|
import org.openqa.selenium.PageLoadStrategy;
|
||||||
|
import org.openqa.selenium.chrome.ChromeDriver;
|
||||||
|
import org.openqa.selenium.chrome.ChromeOptions;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.Connection;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class ScreenshotCaptureToolMain {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
DatabaseModule databaseModule = new DatabaseModule();
|
||||||
|
var ds = databaseModule.provideConnection();
|
||||||
|
|
||||||
|
ChromeDriver driver = initChromeDriver(args);
|
||||||
|
List<EdgeDomain> crawlQueue = getDomains(args, ds);
|
||||||
|
|
||||||
|
try (Connection conn = ds.getConnection()) {
|
||||||
|
for (var domain : crawlQueue) {
|
||||||
|
logger.info("Fetching {}", domain);
|
||||||
|
|
||||||
|
fetchDomain(driver, domain)
|
||||||
|
.ifPresentOrElse(
|
||||||
|
(path) -> uploadScreenshot(conn, domain, path),
|
||||||
|
() -> flagDomainAsFetched(conn, domain));
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} finally {
|
||||||
|
driver.quit();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
private static List<EdgeDomain> getDomains(String[] args, HikariDataSource ds) {
|
||||||
|
List<EdgeDomain> crawlQueue;
|
||||||
|
if (args.length <= 1) {
|
||||||
|
crawlQueue = fetchCrawlQueue(ds, 100);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
crawlQueue = Arrays.stream(args).skip(1).map(EdgeDomain::new).toList();
|
||||||
|
}
|
||||||
|
return crawlQueue;
|
||||||
|
}
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
private static ChromeDriver initChromeDriver(String[] args) {
|
||||||
|
System.setProperty("webdriver.chrome.driver", args[0]);
|
||||||
|
ChromeOptions options = new ChromeOptions();
|
||||||
|
|
||||||
|
options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
|
||||||
|
options.setPageLoadTimeout(Duration.ofSeconds(30));
|
||||||
|
|
||||||
|
options.addArguments(
|
||||||
|
"no-sandbox",
|
||||||
|
"headless",
|
||||||
|
"user-agent=search.marginalia.nu",
|
||||||
|
"window-size=1024,768",
|
||||||
|
"force-device-scale-factor=0.5",
|
||||||
|
"high-dpi-support=0.5",
|
||||||
|
"disable-gpu",
|
||||||
|
"disable-dev-shm-usage",
|
||||||
|
"disable-software-rasterizer");
|
||||||
|
|
||||||
|
return new ChromeDriver(options);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
|
||||||
|
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) {
|
||||||
|
stmt.setString(1, domain.toString());
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) {
|
||||||
|
logger.info("Uploading {}", screenshotPath);
|
||||||
|
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)");
|
||||||
|
var is = Files.newInputStream(screenshotPath)
|
||||||
|
) {
|
||||||
|
stmt.setString(1, domain.toString());
|
||||||
|
stmt.setString(2, "image/webp");
|
||||||
|
stmt.setBlob(3, is);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
|
||||||
|
Files.delete(screenshotPath);
|
||||||
|
|
||||||
|
} catch (SQLException | IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
flagDomainAsFetched(conn, domain);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Optional<Path> fetchDomain(ChromeDriver driver, EdgeDomain domain) {
|
||||||
|
try {
|
||||||
|
driver.get(domain.toRootUrl().toString());
|
||||||
|
|
||||||
|
final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES);
|
||||||
|
|
||||||
|
final var img = ImageIO.read(new ByteArrayInputStream(bytes));
|
||||||
|
|
||||||
|
Path destPath = Files.createTempFile("website-screenshot-", ".webp");
|
||||||
|
ImageIO.write(img, "webp", destPath.toFile());
|
||||||
|
|
||||||
|
// If the screenshot is very small by size, it's very likely not particularly interesting to look at
|
||||||
|
if (Files.size(destPath) < 2500) {
|
||||||
|
Files.delete(destPath);
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.of(destPath);
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
ex.printStackTrace();
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
|
||||||
|
List<EdgeDomain> ret = new ArrayList<>(queueSize);
|
||||||
|
|
||||||
|
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
||||||
|
var rsp = stmt.executeQuery(
|
||||||
|
"""
|
||||||
|
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
|
||||||
|
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
|
||||||
|
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
|
||||||
|
LIMIT
|
||||||
|
""" + queueSize);
|
||||||
|
while (rsp.next()) {
|
||||||
|
ret.add(new EdgeDomain(rsp.getString(1)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
ex.printStackTrace();
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
@ -221,3 +221,8 @@ CREATE TABLE IF NOT EXISTS DATA_DOMAIN_SCREENSHOT (
|
|||||||
ROW_FORMAT=DYNAMIC
|
ROW_FORMAT=DYNAMIC
|
||||||
CHARACTER SET utf8mb4
|
CHARACTER SET utf8mb4
|
||||||
COLLATE utf8mb4_unicode_ci;
|
COLLATE utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- One row per domain holding the date of its most recent screenshot attempt.
-- IF NOT EXISTS keeps this script re-runnable, matching the other tables in this file.
CREATE TABLE IF NOT EXISTS DATA_DOMAIN_HISTORY (
    DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
    SCREENSHOT_DATE DATE DEFAULT NOW()
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
Loading…
Reference in New Issue
Block a user