You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/01/22 06:45:32 UTC
[opennlp-sandbox] 01/01: updates sandbox component 'wikinews-importer' to be compatible with latest opennlp-tools release
This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch migrate-wikinews-importer-to-opennlp-tools-2_1_0
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 608c287b4571529f9408f9d98abafdb39bb03a1f
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Sun Jan 22 07:45:20 2023 +0100
updates sandbox component 'wikinews-importer' to be compatible with latest opennlp-tools release
- adjusts parent project (org.apache.apache) to version 18
- adjusts Java language level to 11
- updates `uimaj` dependencies to version 3.3.1
- modernizes resource handling in `WikinewsConverter`
- corrects some formatting issues
- removes unused imports
---
wikinews-importer/pom.xml | 50 +++++---------
.../wikinews_importer/AnnotatingMarkupParser.java | 77 ++++++++++------------
.../apache/opennlp/wikinews_importer/UimaUtil.java | 8 +--
.../wikinews_importer/WikinewsConverter.java | 67 +++++++------------
4 files changed, 76 insertions(+), 126 deletions(-)
diff --git a/wikinews-importer/pom.xml b/wikinews-importer/pom.xml
index efec79e..3b67865 100644
--- a/wikinews-importer/pom.xml
+++ b/wikinews-importer/pom.xml
@@ -21,75 +21,55 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
-
<parent>
<groupId>org.apache</groupId>
<artifactId>apache</artifactId>
- <version>9</version>
+ <!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. -->
+ <version>18</version>
<relativePath />
</parent>
<groupId>org.apache.opennlp</groupId>
<artifactId>wikinews-importer</artifactId>
- <version>0.0.1-incubating-SNAPSHOT</version>
+ <version>2.1.1-incubating-SNAPSHOT</version>
<packaging>jar</packaging>
- <name>OpenNLP Wikinews Importer</name>
-
- <prerequisites>
- <maven>3.0</maven>
- </prerequisites>
+ <name>Apache OpenNLP Wikinews Importer</name>
- <repositories>
- <repository>
- <id>maven2-repository.java.net</id>
- <name>Java.net Repository for Maven</name>
- <url>http://download.java.net/maven/2/</url>
- <layout>default</layout>
- </repository>
+ <properties>
+ <uimaj.version>3.3.1</uimaj.version>
+ </properties>
- <repository>
- <id>info-bliki-repository</id>
- <url>http://gwtwiki.googlecode.com/svn/maven-repository/</url>
- <releases>
- <enabled>true</enabled>
- </releases>
- <snapshots>
- <enabled>false</enabled>
- </snapshots>
- </repository>
- </repositories>
-
<dependencies>
<dependency>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-json</artifactId>
- <version>1.8</version>
+ <version>1.19.4</version>
</dependency>
<dependency>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-client</artifactId>
- <version>1.8</version>
+ <version>1.19.4</version>
</dependency>
<dependency>
<groupId>info.bliki.wiki</groupId>
<artifactId>bliki-core</artifactId>
- <version>3.0.16</version>
+ <version>3.0.19</version>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>uimaj-core</artifactId>
- <version>2.3.1</version>
+ <version>${uimaj.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>4.8.1</version>
+ <version>4.13.2</version>
<scope>test</scope>
</dependency>
</dependencies>
@@ -100,9 +80,9 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
- <source>1.6</source>
- <target>1.6</target>
- <compilerArgument>-Xlint</compilerArgument>
+ <source>11</source>
+ <target>11</target>
+ <compilerArgument>-Xlint</compilerArgument>
</configuration>
</plugin>
</plugins>
diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
index b1f9b00..f341a29 100644
--- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
+++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
@@ -17,6 +17,13 @@
package org.apache.opennlp.wikinews_importer;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.wiki.filter.ITextConverter;
@@ -28,24 +35,15 @@ import info.bliki.wiki.model.ImageFormat;
import info.bliki.wiki.model.WikiModel;
import info.bliki.wiki.tags.WPATag;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Pattern;
-
/**
* Parse mediawiki markup to strip the formatting info and extract a simple text
* version suitable for NLP along with header, paragraph and link position
* annotations.
- *
+ * <p>
* Use the {@code #convert(String)} and {@code #getWikiLinks()} methods.
- *
- * Due to the constraints imposed by the {@code ITextConverter} /
- * {@code WikiModel} API, this class is not thread safe: only one instance
+ * <p>
+ * Due to the constraints imposed by the {@link ITextConverter} /
+ * {@link WikiModel} API, this class is not thread safe: only one instance
* should be run by thread.
*/
public class AnnotatingMarkupParser implements ITextConverter {
@@ -58,19 +56,17 @@ public class AnnotatingMarkupParser implements ITextConverter {
public static final String WIKIOBJECT_ATTR_KEY = "wikiobject";
- public static final Set<String> PARAGRAPH_TAGS = new HashSet<String>(
- Arrays.asList("p"));
+ public static final Set<String> PARAGRAPH_TAGS = Set.of("p");
- public static final Set<String> HEADING_TAGS = new HashSet<String>(
- Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
+ public static final Set<String> HEADING_TAGS = Set.of("h1", "h2", "h3", "h4", "h5", "h6");
public static final Pattern INTERWIKI_PATTERN = Pattern.compile("http://[\\w-]+\\.wikipedia\\.org/wiki/.*");
- protected final List<Annotation> wikilinks = new ArrayList<Annotation>();
+ protected final List<Annotation> wikilinks = new ArrayList<>();
- protected final List<Annotation> headers = new ArrayList<Annotation>();
+ protected final List<Annotation> headers = new ArrayList<>();
- protected final List<Annotation> paragraphs = new ArrayList<Annotation>();
+ protected final List<Annotation> paragraphs = new ArrayList<>();
protected String languageCode = "en";
@@ -91,24 +87,21 @@ public class AnnotatingMarkupParser implements ITextConverter {
model = makeWikiModel(languageCode);
}
- public WikiModel makeWikiModel(String languageCode) {
- return new WikiModel(String.format(
- "http:/%s.wikipedia.org/wiki/${image}", languageCode),
- String.format("http://%s.wikipedia.org/wiki/${title}",
- languageCode)) {
+ public WikiModel makeWikiModel(String langCode) {
+ return new WikiModel(String.format("https://%s.wikipedia.org/wiki/${image}", langCode),
+ String.format("https://%s.wikipedia.org/wiki/${title}", langCode)) {
@Override
public String getRawWikiContent(String namespace,
String articleName, Map<String, String> templateParameters) {
// disable template support
- // TODO: we need to readd template support at least for dates
+ // TODO: we need to re-add template support at least for dates
return "";
}
};
}
-
- public void nodesToText(List<? extends Object> nodes, Appendable buffer,
- IWikiModel model) throws IOException {
+ @Override
+ public void nodesToText(List<?> nodes, Appendable buffer, IWikiModel model) throws IOException {
CountingAppendable countingBuffer;
if (buffer instanceof CountingAppendable) {
countingBuffer = (CountingAppendable) buffer;
@@ -179,22 +172,18 @@ public class AnnotatingMarkupParser implements ITextConverter {
// sentences with links to entities
hasSpecialHandling = true;
ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY);
- imageNodeToText(tagNode, iformat, countingBuffer,
- model);
+ imageNodeToText(tagNode, iformat, countingBuffer, model);
}
if (!hasSpecialHandling) {
- nodesToText(tagNode.getChildren(), countingBuffer,
- model);
+ nodesToText(tagNode.getChildren(), countingBuffer, model);
}
if (PARAGRAPH_TAGS.contains(tagName)) {
paragraphs.add(new Annotation(tagBegin,
- countingBuffer.currentPosition,
- "paragraph", tagName));
+ countingBuffer.currentPosition, "paragraph", tagName));
countingBuffer.append("\n\n");
} else if (HEADING_TAGS.contains(tagName)) {
headers.add(new Annotation(tagBegin,
- countingBuffer.currentPosition, "heading",
- tagName));
+ countingBuffer.currentPosition, "heading", tagName));
countingBuffer.append("\n\n");
} else if ("a".equals(tagName)) {
String href = attributes.get(HREF_ATTR_KEY);
@@ -212,11 +201,13 @@ public class AnnotatingMarkupParser implements ITextConverter {
}
}
+ @Override
public void imageNodeToText(TagNode tagNode, ImageFormat imageFormat,
Appendable buffer, IWikiModel model) throws IOException {
// nodesToText(tagNode.getChildren(), buffer, model);
}
+ @Override
public boolean noLinks() {
return true;
}
@@ -234,7 +225,7 @@ public class AnnotatingMarkupParser implements ITextConverter {
}
public List<String> getParagraphs() {
- List<String> texts = new ArrayList<String>();
+ List<String> texts = new ArrayList<>();
for (Annotation p : paragraphs) {
texts.add(text.substring(p.begin, p.end));
}
@@ -242,7 +233,7 @@ public class AnnotatingMarkupParser implements ITextConverter {
}
public List<String> getHeaders() {
- List<String> texts = new ArrayList<String>();
+ List<String> texts = new ArrayList<>();
for (Annotation h : headers) {
texts.add(text.substring(h.begin, h.end));
}
@@ -253,7 +244,7 @@ public class AnnotatingMarkupParser implements ITextConverter {
return redirect;
}
- public class CountingAppendable implements Appendable {
+ public static class CountingAppendable implements Appendable {
public int currentPosition = 0;
@@ -263,18 +254,20 @@ public class AnnotatingMarkupParser implements ITextConverter {
this.wrappedBuffer = wrappedBuffer;
}
+ @Override
public Appendable append(CharSequence charSeq) throws IOException {
currentPosition += charSeq.length();
return wrappedBuffer.append(charSeq);
}
+ @Override
public Appendable append(char aChar) throws IOException {
currentPosition += 1;
return wrappedBuffer.append(aChar);
}
- public Appendable append(CharSequence charSeq, int start, int end)
- throws IOException {
+ @Override
+ public Appendable append(CharSequence charSeq, int start, int end) throws IOException {
currentPosition += end - start;
return wrappedBuffer.append(charSeq, start, end);
}
diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
index a9fd480..745ec11 100644
--- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
+++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
@@ -58,8 +58,7 @@ public class UimaUtil {
TypeSystemDescription typeSystemDesciptor;
try {
- typeSystemDesciptor = (TypeSystemDescription) xmlParser
- .parse(xmlTypeSystemSource);
+ typeSystemDesciptor = (TypeSystemDescription) xmlParser.parse(xmlTypeSystemSource);
typeSystemDesciptor.resolveImports();
} catch (InvalidXMLException e) {
@@ -109,11 +108,10 @@ public class UimaUtil {
throw new IllegalStateException("SAX error while creating parser!", e);
}
- XmiCasDeserializer dezerializer = new XmiCasDeserializer(
- cas.getTypeSystem());
+ XmiCasDeserializer deserializer = new XmiCasDeserializer(cas.getTypeSystem());
try {
- saxParser.parse(xmiIn, dezerializer.getXmiCasHandler(cas));
+ saxParser.parse(xmiIn, deserializer.getXmiCasHandler(cas));
} catch (SAXException e) {
throw new IOException("Invalid XMI input!", e);
}
diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
index abbcc54..9c03e74 100644
--- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
+++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
@@ -22,8 +22,8 @@ import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -49,45 +49,38 @@ public class WikinewsConverter {
private final TypeSystemDescription tsDesc;
private final File outputFolder;
- private List<String> endOfArtilceMarkers = new ArrayList<String>();
+ private final List<String> endOfArticleMarkers = new ArrayList<>();
CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) {
this.tsDesc = tsDesc;
this.outputFolder = outputFolder;
- endOfArtilceMarkers.add("{{haveyoursay}}");
- endOfArtilceMarkers.add("== Sources ==");
- endOfArtilceMarkers.add("==Sources==");
- endOfArtilceMarkers.add("== Source ==");
- endOfArtilceMarkers.add("==Source==");
- endOfArtilceMarkers.add("==References==");
- endOfArtilceMarkers.add("== References ==");
- endOfArtilceMarkers.add("=== References===");
+ endOfArticleMarkers.add("{{haveyoursay}}");
+ endOfArticleMarkers.add("== Sources ==");
+ endOfArticleMarkers.add("==Sources==");
+ endOfArticleMarkers.add("== Source ==");
+ endOfArticleMarkers.add("==Source==");
+ endOfArticleMarkers.add("==References==");
+ endOfArticleMarkers.add("== References ==");
+ endOfArticleMarkers.add("=== References===");
}
-
- public static String titleToUri(String title) {
- try {
- return URLEncoder.encode(title.replaceAll(" ", "_"), "UTF-8");
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(e);
- }
+ public static String titleToUri(String title) {
+ return URLEncoder.encode(title.replaceAll(" ", "_"), StandardCharsets.UTF_8);
}
-
+
+ @Override
public void process(WikiArticle page, Siteinfo siteinfo)
throws SAXException {
if (page.getIntegerNamespace() == 0 && page.isMain()) {
-
if (page.getText().toLowerCase().contains("{publish}")) {
String pageText = page.getText();
-
-
int cutIndex = pageText.length();
- for (String endMarker : endOfArtilceMarkers) {
+ for (String endMarker : endOfArticleMarkers) {
int endMarkerIndex = pageText.indexOf(endMarker);
if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
cutIndex = endMarkerIndex;
@@ -98,8 +91,9 @@ public class WikinewsConverter {
pageText = pageText.substring(0, cutIndex);
}
- WikinewsWikiModel wikiModel = new WikinewsWikiModel("http://en.wikinews.org/wiki/${image}",
- "http://en.wikinews.org/wiki/${title}");
+ WikinewsWikiModel wikiModel = new WikinewsWikiModel(
+ "https://en.wikinews.org/wiki/${image}",
+ "https://en.wikinews.org/wiki/${title}");
AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
String plainStr = wikiModel.render(converter, pageText);
@@ -137,8 +131,7 @@ public class WikinewsConverter {
}
for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
- AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(
- articleCAS.getTypeSystem()
+ AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem()
.getType("org.apache.opennlp.annotations.SubHeadline"),
bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);
@@ -150,8 +143,7 @@ public class WikinewsConverter {
Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");
for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
- AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(
- articleCAS.getTypeSystem()
+ AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem()
.getType("org.apache.opennlp.annotations.WikiLink"),
bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);
@@ -164,32 +156,19 @@ public class WikinewsConverter {
markupCas.setDocumentText(page.toString());
// now serialize CAS
- OutputStream casOut = null;
- try {
- casOut = new FileOutputStream(outputFolder.getAbsolutePath() +
- File.separator + titleToUri(page.getTitle()) + ".xmi");
-
+ try (OutputStream casOut = new FileOutputStream(outputFolder.getAbsolutePath() +
+ File.separator + titleToUri(page.getTitle()) + ".xmi")) {
+
UimaUtil.serializeCASToXmi(articleCAS, casOut);
}
catch (IOException e) {
e.printStackTrace();
}
- finally {
- try {
- if (casOut != null)
- casOut.close();
- } catch (IOException e) {
- }
- }
-
}
}
}
}
- /**
- * @param args
- */
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: Parser <XML-File> <Output-Folder>");