You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/01/22 06:45:32 UTC
[opennlp-sandbox] 01/01: updates sandbox component 'wikinews-importer' to be compatible with latest opennlp-tools release
This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch migrate-wikinews-importer-to-opennlp-tools-2_1_0
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 608c287b4571529f9408f9d98abafdb39bb03a1f
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Sun Jan 22 07:45:20 2023 +0100
updates sandbox component 'wikinews-importer' to be compatible with latest opennlp-tools release
- adjusts parent project (org.apache.apache) to version 18
- adjusts Java language level to 11
- updates `uimaj` dependencies to version 3.3.1
- modernizes resource handling in `WikinewsConverter`
- corrects some formatting issues
- removes unused imports
---
wikinews-importer/pom.xml | 50 +++++---------
.../wikinews_importer/AnnotatingMarkupParser.java | 77 ++++++++++------------
.../apache/opennlp/wikinews_importer/UimaUtil.java | 8 +--
.../wikinews_importer/WikinewsConverter.java | 67 +++++++------------
4 files changed, 76 insertions(+), 126 deletions(-)
diff --git a/wikinews-importer/pom.xml b/wikinews-importer/pom.xml
index efec79e..3b67865 100644
--- a/wikinews-importer/pom.xml
+++ b/wikinews-importer/pom.xml
@@ -21,75 +21,55 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
-
<parent>
<groupId>org.apache</groupId>
<artifactId>apache</artifactId>
- <version>9</version>
+ <!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. -->
+ <version>18</version>
<relativePath />
</parent>
<groupId>org.apache.opennlp</groupId>
<artifactId>wikinews-importer</artifactId>
- <version>0.0.1-incubating-SNAPSHOT</version>
+ <version>2.1.1-incubating-SNAPSHOT</version>
<packaging>jar</packaging>
- <name>OpenNLP Wikinews Importer</name>
-
- <prerequisites>
- <maven>3.0</maven>
- </prerequisites>
+ <name>Apache OpenNLP Wikinews Importer</name>
- <repositories>
- <repository>
- <id>maven2-repository.java.net</id>
- <name>Java.net Repository for Maven</name>
- <url>http://download.java.net/maven/2/</url>
- <layout>default</layout>
- </repository>
+ <properties>
+ <uimaj.version>3.3.1</uimaj.version>
+ </properties>
- <repository>
- <id>info-bliki-repository</id>
- <url>http://gwtwiki.googlecode.com/svn/maven-repository/</url>
- <releases>
- <enabled>true</enabled>
- </releases>
- <snapshots>
- <enabled>false</enabled>
- </snapshots>
- </repository>
- </repositories>
-
<dependencies>
<dependency>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-json</artifactId>
- <version>1.8</version>
+ <version>1.19.4</version>
</dependency>
<dependency>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-client</artifactId>
- <version>1.8</version>
+ <version>1.19.4</version>
</dependency>
<dependency>
<groupId>info.bliki.wiki</groupId>
<artifactId>bliki-core</artifactId>
- <version>3.0.16</version>
+ <version>3.0.19</version>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>uimaj-core</artifactId>
- <version>2.3.1</version>
+ <version>${uimaj.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>4.8.1</version>
+ <version>4.13.2</version>
<scope>test</scope>
</dependency>
</dependencies>
@@ -100,9 +80,9 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
- <source>1.6</source>
- <target>1.6</target>
- <compilerArgument>-Xlint</compilerArgument>
+ <source>11</source>
+ <target>11</target>
+ <compilerArgument>-Xlint</compilerArgument>
</configuration>
</plugin>
</plugins>
diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
index b1f9b00..f341a29 100644
--- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
+++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
@@ -17,6 +17,13 @@
package org.apache.opennlp.wikinews_importer;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.wiki.filter.ITextConverter;
@@ -28,24 +35,15 @@ import info.bliki.wiki.model.ImageFormat;
import info.bliki.wiki.model.WikiModel;
import info.bliki.wiki.tags.WPATag;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Pattern;
-
/**
* Parse mediawiki markup to strip the formatting info and extract a simple text
* version suitable for NLP along with header, paragraph and link position
* annotations.
- *
+ * <p>
* Use the {@code #convert(String)} and {@code #getWikiLinks()} methods.
- *
- * Due to the constraints imposed by the {@code ITextConverter} /
- * {@code WikiModel} API, this class is not thread safe: only one instance
+ * <p>
+ * Due to the constraints imposed by the {@link ITextConverter} /
+ * {@link WikiModel} API, this class is not thread safe: only one instance
* should be run by thread.
*/
public class AnnotatingMarkupParser implements ITextConverter {
@@ -58,19 +56,17 @@ public class AnnotatingMarkupParser implements ITextConverter {
public static final String WIKIOBJECT_ATTR_KEY = "wikiobject";
- public static final Set<String> PARAGRAPH_TAGS = new HashSet<String>(
- Arrays.asList("p"));
+ public static final Set<String> PARAGRAPH_TAGS = Set.of("p");
- public static final Set<String> HEADING_TAGS = new HashSet<String>(
- Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
+ public static final Set<String> HEADING_TAGS = Set.of("h1", "h2", "h3", "h4", "h5", "h6");
public static final Pattern INTERWIKI_PATTERN = Pattern.compile("http://[\\w-]+\\.wikipedia\\.org/wiki/.*");
- protected final List<Annotation> wikilinks = new ArrayList<Annotation>();
+ protected final List<Annotation> wikilinks = new ArrayList<>();
- protected final List<Annotation> headers = new ArrayList<Annotation>();
+ protected final List<Annotation> headers = new ArrayList<>();
- protected final List<Annotation> paragraphs = new ArrayList<Annotation>();
+ protected final List<Annotation> paragraphs = new ArrayList<>();
protected String languageCode = "en";
@@ -91,24 +87,21 @@ public class AnnotatingMarkupParser implements ITextConverter {
model = makeWikiModel(languageCode);
}
- public WikiModel makeWikiModel(String languageCode) {
- return new WikiModel(String.format(
- "http:/%s.wikipedia.org/wiki/${image}", languageCode),
- String.format("http://%s.wikipedia.org/wiki/${title}",
- languageCode)) {
+ public WikiModel makeWikiModel(String langCode) {
+ return new WikiModel(String.format("https://%s.wikipedia.org/wiki/${image}", langCode),
+ String.format("https://%s.wikipedia.org/wiki/${title}", langCode)) {
@Override
public String getRawWikiContent(String namespace,
String articleName, Map<String, String> templateParameters) {
// disable template support
- // TODO: we need to readd template support at least for dates
+ // TODO: we need to re-add template support at least for dates
return "";
}
};
}
-
- public void nodesToText(List<? extends Object> nodes, Appendable buffer,
- IWikiModel model) throws IOException {
+ @Override
+ public void nodesToText(List<?> nodes, Appendable buffer, IWikiModel model) throws IOException {
CountingAppendable countingBuffer;
if (buffer instanceof CountingAppendable) {
countingBuffer = (CountingAppendable) buffer;
@@ -179,22 +172,18 @@ public class AnnotatingMarkupParser implements ITextConverter {
// sentences with links to entities
hasSpecialHandling = true;
ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY);
- imageNodeToText(tagNode, iformat, countingBuffer,
- model);
+ imageNodeToText(tagNode, iformat, countingBuffer, model);
}
if (!hasSpecialHandling) {
- nodesToText(tagNode.getChildren(), countingBuffer,
- model);
+ nodesToText(tagNode.getChildren(), countingBuffer, model);
}
if (PARAGRAPH_TAGS.contains(tagName)) {
paragraphs.add(new Annotation(tagBegin,
- countingBuffer.currentPosition,
- "paragraph", tagName));
+ countingBuffer.currentPosition, "paragraph", tagName));
countingBuffer.append("\n\n");
} else if (HEADING_TAGS.contains(tagName)) {
headers.add(new Annotation(tagBegin,
- countingBuffer.currentPosition, "heading",
- tagName));
+ countingBuffer.currentPosition, "heading", tagName));
countingBuffer.append("\n\n");
} else if ("a".equals(tagName)) {
String href = attributes.get(HREF_ATTR_KEY);
@@ -212,11 +201,13 @@ public class AnnotatingMarkupParser implements ITextConverter {
}
}
+ @Override
public void imageNodeToText(TagNode tagNode, ImageFormat imageFormat,
Appendable buffer, IWikiModel model) throws IOException {
// nodesToText(tagNode.getChildren(), buffer, model);
}
+ @Override
public boolean noLinks() {
return true;
}
@@ -234,7 +225,7 @@ public class AnnotatingMarkupParser implements ITextConverter {
}
public List<String> getParagraphs() {
- List<String> texts = new ArrayList<String>();
+ List<String> texts = new ArrayList<>();
for (Annotation p : paragraphs) {
texts.add(text.substring(p.begin, p.end));
}
@@ -242,7 +233,7 @@ public class AnnotatingMarkupParser implements ITextConverter {
}
public List<String> getHeaders() {
- List<String> texts = new ArrayList<String>();
+ List<String> texts = new ArrayList<>();
for (Annotation h : headers) {
texts.add(text.substring(h.begin, h.end));
}
@@ -253,7 +244,7 @@ public class AnnotatingMarkupParser implements ITextConverter {
return redirect;
}
- public class CountingAppendable implements Appendable {
+ public static class CountingAppendable implements Appendable {
public int currentPosition = 0;
@@ -263,18 +254,20 @@ public class AnnotatingMarkupParser implements ITextConverter {
this.wrappedBuffer = wrappedBuffer;
}
+ @Override
public Appendable append(CharSequence charSeq) throws IOException {
currentPosition += charSeq.length();
return wrappedBuffer.append(charSeq);
}
+ @Override
public Appendable append(char aChar) throws IOException {
currentPosition += 1;
return wrappedBuffer.append(aChar);
}
- public Appendable append(CharSequence charSeq, int start, int end)
- throws IOException {
+ @Override
+ public Appendable append(CharSequence charSeq, int start, int end) throws IOException {
currentPosition += end - start;
return wrappedBuffer.append(charSeq, start, end);
}
diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
index a9fd480..745ec11 100644
--- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
+++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
@@ -58,8 +58,7 @@ public class UimaUtil {
TypeSystemDescription typeSystemDesciptor;
try {
- typeSystemDesciptor = (TypeSystemDescription) xmlParser
- .parse(xmlTypeSystemSource);
+ typeSystemDesciptor = (TypeSystemDescription) xmlParser.parse(xmlTypeSystemSource);
typeSystemDesciptor.resolveImports();
} catch (InvalidXMLException e) {
@@ -109,11 +108,10 @@ public class UimaUtil {
throw new IllegalStateException("SAX error while creating parser!", e);
}
- XmiCasDeserializer dezerializer = new XmiCasDeserializer(
- cas.getTypeSystem());
+ XmiCasDeserializer deserializer = new XmiCasDeserializer(cas.getTypeSystem());
try {
- saxParser.parse(xmiIn, dezerializer.getXmiCasHandler(cas));
+ saxParser.parse(xmiIn, deserializer.getXmiCasHandler(cas));
} catch (SAXException e) {
throw new IOException("Invalid XMI input!", e);
}
diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
index abbcc54..9c03e74 100644
--- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
+++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
@@ -22,8 +22,8 @@ import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -49,45 +49,38 @@ public class WikinewsConverter {
private final TypeSystemDescription tsDesc;
private final File outputFolder;
- private List<String> endOfArtilceMarkers = new ArrayList<String>();
+ private final List<String> endOfArticleMarkers = new ArrayList<>();
CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) {
this.tsDesc = tsDesc;
this.outputFolder = outputFolder;
- endOfArtilceMarkers.add("{{haveyoursay}}");
- endOfArtilceMarkers.add("== Sources ==");
- endOfArtilceMarkers.add("==Sources==");
- endOfArtilceMarkers.add("== Source ==");
- endOfArtilceMarkers.add("==Source==");
- endOfArtilceMarkers.add("==References==");
- endOfArtilceMarkers.add("== References ==");
- endOfArtilceMarkers.add("=== References===");
+ endOfArticleMarkers.add("{{haveyoursay}}");
+ endOfArticleMarkers.add("== Sources ==");
+ endOfArticleMarkers.add("==Sources==");
+ endOfArticleMarkers.add("== Source ==");
+ endOfArticleMarkers.add("==Source==");
+ endOfArticleMarkers.add("==References==");
+ endOfArticleMarkers.add("== References ==");
+ endOfArticleMarkers.add("=== References===");
}
-
- public static String titleToUri(String title) {
- try {
- return URLEncoder.encode(title.replaceAll(" ", "_"), "UTF-8");
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(e);
- }
+ public static String titleToUri(String title) {
+ return URLEncoder.encode(title.replaceAll(" ", "_"), StandardCharsets.UTF_8);
}
-
+
+ @Override
public void process(WikiArticle page, Siteinfo siteinfo)
throws SAXException {
if (page.getIntegerNamespace() == 0 && page.isMain()) {
-
if (page.getText().toLowerCase().contains("{publish}")) {
String pageText = page.getText();
-
-
int cutIndex = pageText.length();
- for (String endMarker : endOfArtilceMarkers) {
+ for (String endMarker : endOfArticleMarkers) {
int endMarkerIndex = pageText.indexOf(endMarker);
if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
cutIndex = endMarkerIndex;
@@ -98,8 +91,9 @@ public class WikinewsConverter {
pageText = pageText.substring(0, cutIndex);
}
- WikinewsWikiModel wikiModel = new WikinewsWikiModel("http://en.wikinews.org/wiki/${image}",
- "http://en.wikinews.org/wiki/${title}");
+ WikinewsWikiModel wikiModel = new WikinewsWikiModel(
+ "https://en.wikinews.org/wiki/${image}",
+ "https://en.wikinews.org/wiki/${title}");
AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
String plainStr = wikiModel.render(converter, pageText);
@@ -137,8 +131,7 @@ public class WikinewsConverter {
}
for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
- AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(
- articleCAS.getTypeSystem()
+ AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem()
.getType("org.apache.opennlp.annotations.SubHeadline"),
bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);
@@ -150,8 +143,7 @@ public class WikinewsConverter {
Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");
for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
- AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(
- articleCAS.getTypeSystem()
+ AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem()
.getType("org.apache.opennlp.annotations.WikiLink"),
bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);
@@ -164,32 +156,19 @@ public class WikinewsConverter {
markupCas.setDocumentText(page.toString());
// now serialize CAS
- OutputStream casOut = null;
- try {
- casOut = new FileOutputStream(outputFolder.getAbsolutePath() +
- File.separator + titleToUri(page.getTitle()) + ".xmi");
-
+ try (OutputStream casOut = new FileOutputStream(outputFolder.getAbsolutePath() +
+ File.separator + titleToUri(page.getTitle()) + ".xmi")) {
+
UimaUtil.serializeCASToXmi(articleCAS, casOut);
}
catch (IOException e) {
e.printStackTrace();
}
- finally {
- try {
- if (casOut != null)
- casOut.close();
- } catch (IOException e) {
- }
- }
-
}
}
}
}
- /**
- * @param args
- */
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: Parser <XML-File> <Output-Folder>");