Specialization for javadocs

This commit is contained in:
Viktor Lofgren 2023-07-01 20:16:56 +02:00
parent 24dce8c03b
commit 42375f0e53
4 changed files with 1990 additions and 1 deletions

View File

@ -13,15 +13,19 @@ public class HtmlProcessorSpecializations {
private final LemmySpecialization lemmySpecialization;
private final XenForoSpecialization xenforoSpecialization;
private final PhpBBSpecialization phpBBSpecialization;
private final JavadocSpecialization javadocSpecialization;
private final DefaultSpecialization defaultSpecialization;
@Inject
public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
XenForoSpecialization xenforoSpecialization,
PhpBBSpecialization phpBBSpecialization, DefaultSpecialization defaultSpecialization) {
PhpBBSpecialization phpBBSpecialization,
JavadocSpecialization javadocSpecialization,
DefaultSpecialization defaultSpecialization) {
this.lemmySpecialization = lemmySpecialization;
this.xenforoSpecialization = xenforoSpecialization;
this.phpBBSpecialization = phpBBSpecialization;
this.javadocSpecialization = javadocSpecialization;
this.defaultSpecialization = defaultSpecialization;
}
@ -36,6 +40,10 @@ public class HtmlProcessorSpecializations {
if (generator.keywords().contains("phpbb")) {
return xenforoSpecialization;
}
if (generator.keywords().contains("javadoc")) {
return javadocSpecialization;
}
return defaultSpecialization;
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Set;
/**
 * Specialization of {@link DefaultSpecialization} for Javadoc pages.
 *
 * <p>It differs from the default behaviour in two ways: {@code <noscript>}
 * elements are stripped after the default pruning, and the summary is taken
 * from the first element carrying the CSS class {@code block} when one exists
 * (presumably the description block in Javadoc-generated markup; verify
 * against real pages).
 */
@Singleton
public class JavadocSpecialization extends DefaultSpecialization {
    private static final Logger logger = LoggerFactory.getLogger(JavadocSpecialization.class);

    @Inject
    public JavadocSpecialization(SummaryExtractor summaryExtractor) {
        super(summaryExtractor);
    }

    /**
     * Applies the default pruning and then removes every {@code <noscript>}
     * element from the resulting document.
     *
     * @param doc the document to prune
     * @return the document returned by the default pruning, minus its
     *         {@code <noscript>} elements
     */
    @Override
    public Document prune(Document doc) {
        Document withoutNoscript = super.prune(doc);
        withoutNoscript.getElementsByTag("noscript").remove();
        return withoutNoscript;
    }

    /**
     * Produces a summary from the text of the first element with class
     * {@code block}; when the document has no such element, defers to the
     * default summary extraction.
     *
     * @param doc            the document to summarize
     * @param importantWords forwarded unchanged to the default extraction
     *                       when the fallback path is taken
     * @return the text of the first {@code block} element, or the default
     *         summary if none is present
     */
    @Override
    public String getSummary(Document doc,
                             Set<String> importantWords) {
        var firstBlock = doc.getElementsByClass("block").first();

        // No "block" element: fall back to the generic summary logic.
        if (firstBlock == null) {
            return super.getSummary(doc, importantWords);
        }

        return firstBlock.text();
    }
}

View File

@ -0,0 +1,48 @@
package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.util.Set;
/**
 * Smoke tests for {@link JavadocSpecialization}, run against the
 * {@code mock-crawl-data/javadoc/stream.html} fixture.
 *
 * <p>Each test only prints its result to standard output; nothing is asserted,
 * so a test fails only if the exercised call throws.
 */
class JavadocSpecializationTest {
    static JavadocSpecialization specialization;
    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();

    // Raw HTML of the fixture page; parsed anew in every test.
    String thread = CommonTestData.loadTestData("mock-crawl-data/javadoc/stream.html");

    /**
     * Builds the specialization under test once for all tests, backed by a
     * {@link SummaryExtractor} constructed with 255 and null for its remaining
     * constructor arguments.
     */
    @BeforeAll
    public static void setUpAll() {
        var summaryExtractor = new SummaryExtractor(255, null, null, null, null, null);
        specialization = new JavadocSpecialization(summaryExtractor);
    }

    /** Prunes the parsed fixture and prints the resulting document. */
    @Test
    void prune() {
        var parsed = Jsoup.parse(thread);
        var pruned = specialization.prune(parsed);
        System.out.println(pruned);
    }

    /** Prints the cleaned generator information extracted from the parsed fixture. */
    @Test
    void generatorExtraction() {
        var parsed = Jsoup.parse(thread);
        var gen = generatorExtractor.generatorCleaned(parsed);
        System.out.println(gen);
    }

    /** Prints the summary produced for the parsed fixture. */
    @Test
    void getSummary() {
        var parsed = Jsoup.parse(thread);
        String summary = specialization.getSummary(parsed, Set.of(""));
        System.out.println(summary);
    }
}

File diff suppressed because it is too large Load Diff