(screenshot-capture-tool) Make screenshot-capture-tool cooperate with docker
This commit is contained in:
parent
a8b9d21f2d
commit
3047e2dd7c
@ -8,7 +8,6 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.model.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
|
@ -3,6 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.palantir.docker' version '0.35.0'
|
||||
}
|
||||
|
||||
java {
|
||||
@ -16,6 +17,8 @@ application {
|
||||
applicationName = 'screenshot-capture-tool'
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/docker-service.gradle"
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
dependencies {
|
||||
@ -30,6 +33,7 @@ dependencies {
|
||||
implementation libs.commons.compress
|
||||
implementation libs.commons.io
|
||||
implementation libs.guice
|
||||
implementation libs.gson
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
|
@ -1,37 +1,28 @@
|
||||
package nu.marginalia.screenshot;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.openqa.selenium.OutputType;
|
||||
import org.openqa.selenium.PageLoadStrategy;
|
||||
import org.openqa.selenium.TimeoutException;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriverService;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.openqa.selenium.support.ui.ExpectedCondition;
|
||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||
import org.openqa.selenium.JavascriptExecutor;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class ScreenshotCaptureToolMain {
|
||||
|
||||
@ -43,30 +34,21 @@ public class ScreenshotCaptureToolMain {
|
||||
|
||||
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
|
||||
|
||||
ChromeDriver driver = initChromeDriver();
|
||||
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
|
||||
|
||||
HttpClient httpClient = HttpClient.newHttpClient();
|
||||
HttpClient httpClient = HttpClient.newBuilder()
|
||||
.version(HttpClient.Version.HTTP_1_1)
|
||||
.connectTimeout(Duration.ofSeconds(30))
|
||||
.build()
|
||||
;
|
||||
|
||||
try (Connection conn = ds.getConnection()) {
|
||||
|
||||
|
||||
logger.info("Probing domains");
|
||||
var ret = crawlQueue.parallelStream().collect(Collectors.partitioningBy(domain -> probeUrl(httpClient, domain)));
|
||||
|
||||
var badDomains = ret.getOrDefault(Boolean.FALSE, Collections.emptyList());
|
||||
var goodDomains = ret.getOrDefault(Boolean.TRUE, Collections.emptyList());
|
||||
|
||||
logger.info("Result: {} good domains, {} bad domains", goodDomains.size(), badDomains.size());
|
||||
|
||||
badDomains.forEach(domain -> flagDomainAsFetched(conn, domain));
|
||||
|
||||
for (var domain : goodDomains) {
|
||||
for (var domain : crawlQueue) {
|
||||
logger.info("Fetching {}", domain);
|
||||
|
||||
var filePath = fetchDomain(driver, domain);
|
||||
if (filePath != null) {
|
||||
uploadScreenshot(conn, domain, filePath);
|
||||
byte[] webpBytes = fetchDomain(httpClient, domain);
|
||||
if (webpBytes != null) {
|
||||
uploadScreenshot(conn, domain, webpBytes);
|
||||
} else {
|
||||
flagDomainAsFetched(conn, domain);
|
||||
}
|
||||
@ -74,38 +56,15 @@ public class ScreenshotCaptureToolMain {
|
||||
|
||||
} catch (SQLException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
driver.quit();
|
||||
}
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static ChromeDriver initChromeDriver() {
|
||||
System.setProperty("webdriver.chrome.driver", "./chromedriver");
|
||||
ChromeOptions options = new ChromeOptions();
|
||||
|
||||
options.setPageLoadStrategy(PageLoadStrategy.NONE);
|
||||
options.setPageLoadTimeout(Duration.ofSeconds(30));
|
||||
|
||||
options.addArguments(
|
||||
"no-sandbox",
|
||||
"headless",
|
||||
"user-agent=search.marginalia.nu",
|
||||
"window-size=1024,768",
|
||||
"force-device-scale-factor=0.5",
|
||||
"high-dpi-support=0.5",
|
||||
"dns-prefetch-disable",
|
||||
"disable-gpu",
|
||||
"disable-dev-shm-usage",
|
||||
"disable-software-rasterizer",
|
||||
"disable-extensions"
|
||||
);
|
||||
|
||||
return new ChromeDriver(options);
|
||||
}
|
||||
|
||||
private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
|
||||
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) VALUES (?, NOW())")) {
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE)
|
||||
VALUES (?, NOW())
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.executeUpdate();
|
||||
} catch (SQLException e) {
|
||||
@ -113,87 +72,68 @@ public class ScreenshotCaptureToolMain {
|
||||
}
|
||||
}
|
||||
|
||||
private static void uploadScreenshot(Connection conn, EdgeDomain domain, Path screenshotPath) {
|
||||
logger.info("Uploading {}", screenshotPath);
|
||||
try (var stmt = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)");
|
||||
var is = Files.newInputStream(screenshotPath)
|
||||
private static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] webpBytes) {
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA)
|
||||
VALUES (?,?,?)
|
||||
""");
|
||||
var is = new ByteArrayInputStream(webpBytes)
|
||||
) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.setString(2, "image/webp");
|
||||
stmt.setString(2, "image/png");
|
||||
stmt.setBlob(3, is);
|
||||
stmt.executeUpdate();
|
||||
|
||||
Files.delete(screenshotPath);
|
||||
|
||||
} catch (SQLException | IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
flagDomainAsFetched(conn, domain);
|
||||
}
|
||||
|
||||
private static boolean probeUrl(HttpClient httpClient, EdgeDomain domain) {
|
||||
private static Gson gson = GsonFactory.get();
|
||||
|
||||
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
|
||||
try {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", domain.toRootUrl().toString(),
|
||||
"options",
|
||||
Map.of("fullPage", false,
|
||||
"type", "png"),
|
||||
"gotoOptions", Map.of(
|
||||
"waitUntil", "networkidle2",
|
||||
"timeout", TimeUnit.SECONDS.toMillis(10)
|
||||
)
|
||||
);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(new URI(domain.toRootUrl().toString()))
|
||||
.timeout(Duration.ofSeconds(5))
|
||||
.method("HEAD", HttpRequest.BodyPublishers.noBody())
|
||||
.header("user-agent", "search.marginialia.nu")
|
||||
.uri(new URI("http://browserless:3000/screenshot"))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
.header("Content-type", "application/json")
|
||||
.build();
|
||||
var rsp = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
return response.statusCode() < 400;
|
||||
} catch (Exception ex) {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
private static Path fetchDomain(ChromeDriver driver, EdgeDomain domain) {
|
||||
try {
|
||||
driver.get(domain.toRootUrl().toString());
|
||||
|
||||
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
|
||||
|
||||
try {
|
||||
wait.until((ExpectedCondition<Boolean>) wd -> {
|
||||
if (wd instanceof JavascriptExecutor jse) {
|
||||
return "complete".equals(jse.executeScript("return document.readyState"));
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
catch (TimeoutException ex) {
|
||||
logger.info("Wait timed out, forcing window.stop()");
|
||||
driver.executeScript("window.stop()");
|
||||
}
|
||||
|
||||
|
||||
|
||||
final byte[] bytes = driver.getScreenshotAs(OutputType.BYTES);
|
||||
|
||||
final var img = ImageIO.read(new ByteArrayInputStream(bytes));
|
||||
|
||||
|
||||
Path destPath = Files.createTempFile("website-screenshot-", ".webp");
|
||||
ImageIO.write(img, "webp", destPath.toFile());
|
||||
|
||||
// If the screenshot is very small by size, it's very likely not particularly interesting to look at
|
||||
if (Files.size(destPath) < 3500) {
|
||||
Files.delete(destPath);
|
||||
if (rsp.statusCode() >= 300) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return destPath;
|
||||
byte[] image = rsp.body();
|
||||
if (image.length < 3500) {
|
||||
logger.warn("Skipping {} due to size ({})", domain, image.length);
|
||||
return null;
|
||||
}
|
||||
|
||||
return image;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("Exception in screenshotting " + domain, ex);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
|
||||
|
||||
List<EdgeDomain> ret = new ArrayList<>(queueSize);
|
||||
|
||||
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
||||
@ -202,14 +142,14 @@ public class ScreenshotCaptureToolMain {
|
||||
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
|
||||
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
|
||||
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
|
||||
LIMIT
|
||||
LIMIT
|
||||
""" + queueSize);
|
||||
while (rsp.next()) {
|
||||
ret.add(new EdgeDomain(rsp.getString(1)));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
catch (Exception ex) {
|
||||
logger.warn("Exception in fetching queue", ex);
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return ret;
|
||||
|
53
docker-compose-screenshot-bot.yml
Normal file
53
docker-compose-screenshot-bot.yml
Normal file
@ -0,0 +1,53 @@
|
||||
# Compose stack for the screenshot bot: the capture tool plus a
# browserless/chrome container.

# Shared defaults, merged into each service below through `<<: *service`.
x-svc: &service
  env_file:
    - "run/env/service.env"
  volumes:
    - conf:/wmsa/conf:ro
    - data:/wmsa/data
    - logs:/var/log/wmsa
  networks:
    - wmsa

services:
  screenshot-capture-tool:
    <<: *service
    image: "marginalia.nu/screenshot-capture-tool"
    container_name: "screenshot-capture-tool"
    # Replaces the networks list inherited from the shared defaults.
    networks:
      - wmsa
      - headlesschrome
    depends_on:
      - browserless

  browserless:
    <<: *service
    image: "browserless/chrome"
    container_name: "headlesschrome"
    # Replaces the env_file list inherited from the shared defaults.
    env_file:
      - "run/env/browserless.env"
    ports:
      - "3000:3000"
    networks:
      - wmsa
      - headlesschrome

networks:
  wmsa:
  headlesschrome:

# Named volumes using the local driver with bind options onto run/ directories.
volumes:
  logs:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: run/logs
  conf:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: run/conf
  data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: run/data
|
2
run/env/browserless.env
vendored
Normal file
2
run/env/browserless.env
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
DEFAULT_LAUNCH_ARGS='["--window-size=1024,768", "--user-agent=search.marginalia.nu", "--force-device-scale-factor=0.5", "--high-dpi-support=0.5"]'
|
||||
DEBUG=-*
|
40
run/experiment-elsewhere.sh
Executable file
40
run/experiment-elsewhere.sh
Executable file
@ -0,0 +1,40 @@
|
||||
#!/bin/bash
#
# Unpacks the experiment-runner distribution into install/ and runs one
# experiment with JAVA_OPTS overriding the JDBC URL to a MariaDB on localhost.
#
# Usage: experiment-elsewhere.sh experiment-name path-to-crawl-data [args...]
#   $1   experiment name (required)
#   $2   forwarded as the first argument to experiment-runner; also used to
#        derive SAMPLE_NAME (falls back to "m" when omitted)
#   $3+  forwarded to experiment-runner after the experiment name

set -e

EXPERIMENT=$1
SAMPLE_NAME=crawl-${2:-m}
# Keep the extra arguments in an array so that arguments containing
# whitespace are forwarded intact instead of being re-split.
ARGS=("${@:3}")
SAMPLE_DIR="node-1/samples/${SAMPLE_NAME}/"

export EXPERIMENT_RUNNER_OPTS="--enable-preview"
echo "args = ${ARGS[*]}"

## Configuration

# Every relative path below is resolved against the directory of this script.
# Quoted so a checkout path containing spaces still works.
pushd "$(dirname "$0")"

JAVA_OPTS="
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
-ea
"

## Configuration ends

if [ -z "$EXPERIMENT" ]; then
  echo "Usage: $0 experiment-name path-to-crawl-data"
  exit 255
fi

# tar -C requires the target directory to exist; without this a fresh
# checkout aborts here because of `set -e`.
mkdir -p install
tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/

PATH+=":install/experiment-runner/bin"

export WMSA_HOME=.
export PATH
export JAVA_OPTS

# ${2:+"$2"} passes $2 as a single word when it is set and passes nothing
# when it is empty or unset, matching the previous unquoted behaviour.
experiment-runner ${2:+"$2"} "${EXPERIMENT}" "${ARGS[@]}"

popd
|
Loading…
Reference in New Issue
Block a user