You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/07/07 00:10:43 UTC
svn commit: r1143582 - in /incubator/opennlp/sandbox/wikinews-importer: ./
samples/ src/main/java/org/apache/opennlp/wikinews_importer/
Author: joern
Date: Wed Jul 6 22:10:42 2011
New Revision: 1143582
URL: http://svn.apache.org/viewvc?rev=1143582&view=rev
Log:
OPENNLP-211 First version of wikinews importer, based on code contributed by Olivier Grisel. Thanks.
Added:
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java (with props)
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java (with props)
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java (with props)
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java (with props)
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java (with props)
Removed:
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/CreateCorpus.java
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/FileUtil.java
incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsImporter.java
Modified:
incubator/opennlp/sandbox/wikinews-importer/pom.xml
incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml
Modified: incubator/opennlp/sandbox/wikinews-importer/pom.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/pom.xml?rev=1143582&r1=1143581&r2=1143582&view=diff
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/pom.xml (original)
+++ incubator/opennlp/sandbox/wikinews-importer/pom.xml Wed Jul 6 22:10:42 2011
@@ -32,7 +32,7 @@
<groupId>org.apache.opennlp</groupId>
<artifactId>wikinews-importer</artifactId>
<version>0.0.1-incubating-SNAPSHOT</version>
- <packaging>war</packaging>
+ <packaging>jar</packaging>
<name>OpenNLP Wikinews Importer</name>
@@ -42,16 +42,21 @@
<repositories>
<repository>
- <id>maven2-repository.java.net</id>
- <name>Java.net Repository for Maven</name>
- <url>http://download.java.net/maven/2/</url>
- <layout>default</layout>
- </repository>
+ <id>maven2-repository.java.net</id>
+ <name>Java.net Repository for Maven</name>
+ <url>http://download.java.net/maven/2/</url>
+ <layout>default</layout>
+ </repository>
+
<repository>
- <id>maven-repository.java.net</id>
- <name>Java.net Maven 1 Repository (legacy)</name>
- <url>http://download.java.net/maven/1</url>
- <layout>legacy</layout>
+ <id>info-bliki-repository</id>
+ <url>http://gwtwiki.googlecode.com/svn/maven-repository/</url>
+ <releases>
+ <enabled>true</enabled>
+ </releases>
+ <snapshots>
+ <enabled>false</enabled>
+ </snapshots>
</repository>
</repositories>
@@ -68,6 +73,19 @@
<version>1.8</version>
</dependency>
+ <dependency>
+ <groupId>info.bliki.wiki</groupId>
+ <artifactId>bliki-core</artifactId>
+ <version>3.0.16</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.uima</groupId>
+ <artifactId>uimaj-core</artifactId>
+ <version>2.3.1</version>
+ <scope>compile</scope>
+ </dependency>
+
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Modified: incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml?rev=1143582&r1=1143581&r2=1143582&view=diff
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml (original)
+++ incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml Wed Jul 6 22:10:42 2011
@@ -32,34 +32,53 @@
<description></description>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
-
+
+ <typeDescription>
+ <name>org.apache.opennlp.annotations.SubHeadline</name>
+ <description></description>
+ <supertypeName>uima.tcas.Annotation</supertypeName>
+ </typeDescription>
+
<typeDescription>
<name>org.apache.opennlp.annotations.Paragraph</name>
<description></description>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
-
+
<typeDescription>
<name>org.apache.opennlp.annotations.Sentence</name>
<description></description>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
-
+
<typeDescription>
<name>org.apache.opennlp.annotations.Token</name>
<description></description>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
-
+
<typeDescription>
<name>org.apache.opennlp.annotations.Person</name>
<description></description>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
+
<typeDescription>
<name>org.apache.opennlp.annotations.Organization</name>
<description></description>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
+
+ <typeDescription>
+ <name>org.apache.opennlp.annotations.WikiLink</name>
+ <supertypeName>uima.tcas.Annotation</supertypeName>
+ <features>
+ <featureDescription>
+ <name>link</name>
+ <description></description>
+ <rangeTypeName>uima.cas.String</rangeTypeName>
+ </featureDescription>
+ </features>
+ </typeDescription>
</types>
</typeSystemDescription>
\ No newline at end of file
Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java Wed Jul 6 22:10:42 2011
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import info.bliki.htmlcleaner.ContentToken;
+import info.bliki.htmlcleaner.TagNode;
+import info.bliki.wiki.filter.ITextConverter;
+import info.bliki.wiki.filter.WPList;
+import info.bliki.wiki.filter.WPTable;
+import info.bliki.wiki.model.Configuration;
+import info.bliki.wiki.model.IWikiModel;
+import info.bliki.wiki.model.ImageFormat;
+import info.bliki.wiki.model.WikiModel;
+import info.bliki.wiki.tags.WPATag;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Parses mediawiki markup to strip the formatting information and extract a
+ * simple text version suitable for NLP, along with heading, paragraph and link
+ * position annotations.
+ *
+ * An instance is passed to a wiki model as its {@code ITextConverter} (see
+ * {@code WikinewsConverter}, which calls {@code WikiModel#render}); the text is
+ * produced through {@link #nodesToText(List, Appendable, IWikiModel)}.
+ * Afterwards the collected annotations are available through
+ * {@link #getWikiLinkAnnotations()}, {@link #getHeaderAnnotations()} and
+ * {@link #getParagraphAnnotations()}.
+ *
+ * Annotation offsets are character positions counted from the first character
+ * appended through the {@link CountingAppendable} created for a rendering run.
+ * The annotation lists are only ever added to, so a fresh instance should be
+ * used for every document.
+ *
+ * Due to the constraints imposed by the {@code ITextConverter} /
+ * {@code WikiModel} API, this class is not thread safe: an instance must only
+ * be used by one thread.
+ */
+public class AnnotatingMarkupParser implements ITextConverter {
+
+  /** Attribute holding the target of a plain {@code a} tag. */
+  public static final String HREF_ATTR_KEY = "href";
+
+  /** Attribute holding the title (label) of a wiki link tag. */
+  public static final String WIKILINK_TITLE_ATTR_KEY = "title";
+
+  /** Attribute holding the target of a wiki link tag. */
+  public static final String WIKILINK_TARGET_ATTR_KEY = "href";
+
+  /** Object attribute under which image tags carry their {@code ImageFormat}. */
+  public static final String WIKIOBJECT_ATTR_KEY = "wikiobject";
+
+  /** Tag names which are recorded as paragraph annotations. */
+  public static final Set<String> PARAGRAPH_TAGS = new HashSet<String>(
+      Arrays.asList("p"));
+
+  /** Tag names which are recorded as heading annotations. */
+  public static final Set<String> HEADING_TAGS = new HashSet<String>(
+      Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
+
+  public static final Pattern INTERWIKI_PATTERN = Pattern.compile("http://[\\w-]+\\.wikipedia\\.org/wiki/.*");
+
+  /** Link annotations, label is the link title and value the link target. */
+  protected final List<Annotation> wikilinks = new ArrayList<Annotation>();
+
+  /** Heading annotations, value is the tag name (h1 ... h6). */
+  protected final List<Annotation> headers = new ArrayList<Annotation>();
+
+  /** Paragraph annotations, value is the tag name. */
+  protected final List<Annotation> paragraphs = new ArrayList<Annotation>();
+
+  protected String languageCode = "en";
+
+  protected final WikiModel model;
+
+  /** Never assigned by this class; only a subclass can set it. */
+  protected String redirect;
+
+  /**
+   * The converted plain text. Never assigned by this class; only a subclass
+   * can set it. {@link #getParagraphs()} and {@link #getHeaders()} need it.
+   */
+  protected String text;
+
+  protected static final Pattern REDIRECT_PATTERN = Pattern.compile("^#REDIRECT \\[\\[([^\\]]*)\\]\\]");
+
+  /**
+   * Creates a parser with the default language code "en".
+   */
+  public AnnotatingMarkupParser() {
+    model = makeWikiModel(languageCode);
+  }
+
+  /**
+   * Creates a parser for the given language.
+   *
+   * @param languageCode the language code used as host prefix of the
+   *     wikipedia.org URLs of the internal wiki model
+   */
+  public AnnotatingMarkupParser(String languageCode) {
+    this.languageCode = languageCode;
+    model = makeWikiModel(languageCode);
+  }
+
+  /**
+   * Creates a wiki model whose image and link base URLs point to the
+   * wikipedia.org host of the given language and which resolves every
+   * template to the empty string.
+   *
+   * @param languageCode the language code used as host prefix
+   * @return the new wiki model
+   */
+  public WikiModel makeWikiModel(String languageCode) {
+    // Note: the image URL previously started with "http:/" (single slash)
+    return new WikiModel(String.format(
+        "http://%s.wikipedia.org/wiki/${image}", languageCode),
+        String.format("http://%s.wikipedia.org/wiki/${title}",
+            languageCode)) {
+      @Override
+      public String getRawWikiContent(String namespace,
+          String articleName, Map<String, String> templateParameters) {
+        // disable template support
+        // TODO: we need to readd template support at least for dates
+        return "";
+      }
+    };
+  }
+
+  /**
+   * Appends the plain text of the given nodes to the buffer and records link,
+   * heading and paragraph annotations for them.
+   *
+   * Lists, tables and {@code ref} tags are skipped. Every paragraph and
+   * heading is followed by two newlines which are not part of its annotation.
+   *
+   * @param nodes the nodes to convert, nothing is done for {@code null} or an
+   *     empty list
+   * @param buffer the target buffer; it is wrapped into a
+   *     {@link CountingAppendable} starting at position 0 unless it already
+   *     is one
+   * @param model the wiki model, used to track the recursion level
+   *
+   * @throws IOException if appending to the buffer fails
+   */
+  public void nodesToText(List<? extends Object> nodes, Appendable buffer,
+      IWikiModel model) throws IOException {
+    CountingAppendable countingBuffer;
+    if (buffer instanceof CountingAppendable) {
+      countingBuffer = (CountingAppendable) buffer;
+    } else {
+      // wrap
+      countingBuffer = new CountingAppendable(buffer);
+    }
+
+    if (nodes == null || nodes.isEmpty()) {
+      return;
+    }
+
+    try {
+      int level = model.incrementRecursionLevel();
+      if (level > Configuration.RENDERER_RECURSION_LIMIT) {
+        countingBuffer.append("Error - recursion limit exceeded"
+            + " rendering tags in PlainTextConverter#nodesToText().");
+        return;
+      }
+      for (Object node : nodes) {
+        if (node instanceof WPATag) {
+          // extract wikilink annotations
+          WPATag tag = (WPATag) node;
+          String wikilinkLabel = (String) tag.getAttributes().get(
+              WIKILINK_TITLE_ATTR_KEY);
+          String wikilinkTarget = (String) tag.getAttributes().get(
+              WIKILINK_TARGET_ATTR_KEY);
+
+          int start = countingBuffer.currentPosition;
+          tag.getBodyString(countingBuffer);
+          int end = countingBuffer.currentPosition;
+
+          // Only links with a title are annotated, links without a target
+          // and links to an anchor (#...) are left out
+          if (wikilinkLabel != null && wikilinkTarget != null
+              && !wikilinkTarget.startsWith("#")) {
+            // TODO: wikilink label is not important,since that is the covered text?
+            wikilinks.add(new Annotation(start, end, wikilinkLabel, wikilinkTarget));
+          }
+        } else if (node instanceof ContentToken) {
+          ContentToken contentToken = (ContentToken) node;
+          countingBuffer.append(contentToken.getContent());
+        } else if (node instanceof List || node instanceof WPList
+            || node instanceof WPTable) {
+          // ignore lists and tables since they most of the time
+          // do not hold grammatically correct
+          // interesting sentences that are representative of the
+          // language.
+        } else if (node instanceof TagNode) {
+          TagNode tagNode = (TagNode) node;
+          Map<String, String> attributes = tagNode.getAttributes();
+          Map<String, Object> oAttributes = tagNode.getObjectAttributes();
+          boolean hasSpecialHandling = false;
+          String tagName = tagNode.getName();
+          int tagBegin = countingBuffer.currentPosition;
+
+          if ("ref".equals(tagName)) {
+            // ignore the references since they do not hold
+            // interesting text content
+            hasSpecialHandling = true;
+          } else if (oAttributes != null
+              && oAttributes.get(WIKIOBJECT_ATTR_KEY) instanceof ImageFormat) {
+            // the caption of images often holds well formed
+            // sentences with links to entities
+            hasSpecialHandling = true;
+            ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY);
+            imageNodeToText(tagNode, iformat, countingBuffer, model);
+          }
+
+          if (!hasSpecialHandling) {
+            nodesToText(tagNode.getChildren(), countingBuffer, model);
+          }
+
+          if (PARAGRAPH_TAGS.contains(tagName)) {
+            paragraphs.add(new Annotation(tagBegin,
+                countingBuffer.currentPosition, "paragraph", tagName));
+            countingBuffer.append("\n\n");
+          } else if (HEADING_TAGS.contains(tagName)) {
+            headers.add(new Annotation(tagBegin,
+                countingBuffer.currentPosition, "heading", tagName));
+            countingBuffer.append("\n\n");
+          } else if ("a".equals(tagName)) {
+            String href = attributes != null ? attributes.get(HREF_ATTR_KEY) : null;
+
+            // TODO: How to get covered text here? Is not needed anyway right?!
+            wikilinks.add(new Annotation(tagBegin, countingBuffer.currentPosition,
+                "", href));
+          }
+        }
+      }
+    } finally {
+      model.decrementRecursionLevel();
+    }
+  }
+
+  /**
+   * Hook for converting an image node. This implementation appends nothing,
+   * therefore image captions are not part of the text.
+   */
+  public void imageNodeToText(TagNode tagNode, ImageFormat imageFormat,
+      Appendable buffer, IWikiModel model) throws IOException {
+  }
+
+  public boolean noLinks() {
+    return true;
+  }
+
+  /**
+   * @return the live list of link annotations collected so far
+   */
+  public List<Annotation> getWikiLinkAnnotations() {
+    return wikilinks;
+  }
+
+  /**
+   * @return the live list of heading annotations collected so far
+   */
+  public List<Annotation> getHeaderAnnotations() {
+    return headers;
+  }
+
+  /**
+   * @return the live list of paragraph annotations collected so far
+   */
+  public List<Annotation> getParagraphAnnotations() {
+    return paragraphs;
+  }
+
+  /**
+   * @return the text covered by each paragraph annotation
+   *
+   * @throws IllegalStateException if {@link #text} has not been set
+   */
+  public List<String> getParagraphs() {
+    return coveredTexts(paragraphs);
+  }
+
+  /**
+   * @return the text covered by each heading annotation
+   *
+   * @throws IllegalStateException if {@link #text} has not been set
+   */
+  public List<String> getHeaders() {
+    return coveredTexts(headers);
+  }
+
+  /**
+   * Cuts the text covered by each of the annotations out of {@link #text}.
+   */
+  private List<String> coveredTexts(List<Annotation> annotations) {
+    if (text == null) {
+      throw new IllegalStateException(
+          "The plain text is not set, covered text cannot be extracted!");
+    }
+
+    List<String> texts = new ArrayList<String>(annotations.size());
+    for (Annotation annotation : annotations) {
+      texts.add(text.substring(annotation.begin, annotation.end));
+    }
+    return texts;
+  }
+
+  /**
+   * @return the redirect, {@code null} unless a subclass assigned one
+   */
+  public String getRedirect() {
+    return redirect;
+  }
+
+  /**
+   * An {@link Appendable} which forwards everything to a wrapped buffer and
+   * counts the number of characters appended through it.
+   */
+  public class CountingAppendable implements Appendable {
+
+    /** Number of characters appended through this instance so far. */
+    public int currentPosition = 0;
+
+    final protected Appendable wrappedBuffer;
+
+    public CountingAppendable(Appendable wrappedBuffer) {
+      this.wrappedBuffer = wrappedBuffer;
+    }
+
+    // All append methods return this instance (as the Appendable contract
+    // asks for) and not the wrapped buffer, otherwise chained appends would
+    // bypass the counting. The position is advanced after the wrapped append
+    // succeeded.
+
+    public Appendable append(CharSequence charSeq) throws IOException {
+      wrappedBuffer.append(charSeq);
+      // null is appended as the four characters "null"
+      currentPosition += charSeq != null ? charSeq.length() : 4;
+      return this;
+    }
+
+    public Appendable append(char aChar) throws IOException {
+      wrappedBuffer.append(aChar);
+      currentPosition += 1;
+      return this;
+    }
+
+    public Appendable append(CharSequence charSeq, int start, int end)
+        throws IOException {
+      wrappedBuffer.append(charSeq, start, end);
+      currentPosition += end - start;
+      return this;
+    }
+
+  }
+
+}
Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java Wed Jul 6 22:10:42 2011
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+public class Annotation {
+
+ public final int begin;
+
+ public final int end;
+
+ public final String label;
+
+ public final String value;
+
+ public Annotation(int start, int end, String label, String value) {
+ this.begin = start;
+ this.end = end;
+ this.label = label;
+ this.value = value;
+ }
+
+}
Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java Wed Jul 6 22:10:42 2011
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.uima.ResourceSpecifierFactory;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.FsIndexDescription;
+import org.apache.uima.resource.metadata.TypePriorities;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.resource.metadata.impl.FsIndexDescription_impl;
+import org.apache.uima.util.CasCreationUtils;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XMLParser;
+import org.apache.uima.util.XMLSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * Helper methods to create a CAS and to read and write it in the XMI format.
+ */
+public class UimaUtil {
+
+  /**
+   * Parses a type system descriptor and resolves its imports.
+   *
+   * @param in the stream to read the descriptor from, it is not closed here
+   *
+   * @return the type system description, or {@code null} if the descriptor
+   *     is invalid; in that case the stack trace is printed
+   */
+  static TypeSystemDescription createTypeSystemDescription(InputStream in) {
+
+    // Note:
+    // Type System location is not set correctly,
+    // resolving a referenced type system will fail
+
+    XMLInputSource xmlTypeSystemSource = new XMLInputSource(in, new File(""));
+
+    XMLParser xmlParser = UIMAFramework.getXMLParser();
+
+    TypeSystemDescription typeSystemDescriptor;
+
+    try {
+      typeSystemDescriptor = (TypeSystemDescription) xmlParser
+          .parse(xmlTypeSystemSource);
+
+      typeSystemDescriptor.resolveImports();
+    } catch (InvalidXMLException e) {
+      e.printStackTrace();
+      typeSystemDescriptor = null;
+    }
+
+    return typeSystemDescriptor;
+  }
+
+  /**
+   * Creates a new CAS for the given type system with default type
+   * priorities and one sorted index named "TOPIndex" over uima.cas.TOP.
+   *
+   * @param typeSystem the type system of the new CAS
+   *
+   * @return the new CAS, or {@code null} if it could not be created; in that
+   *     case the stack trace is printed
+   */
+  static CAS createEmptyCAS(TypeSystemDescription typeSystem) {
+    ResourceSpecifierFactory resourceSpecifierFactory = UIMAFramework
+        .getResourceSpecifierFactory();
+    TypePriorities typePriorities = resourceSpecifierFactory
+        .createTypePriorities();
+
+    FsIndexDescription indexDescriptor = new FsIndexDescription_impl();
+    indexDescriptor.setLabel("TOPIndex");
+    indexDescriptor.setTypeName("uima.cas.TOP");
+    indexDescriptor.setKind(FsIndexDescription.KIND_SORTED);
+
+    CAS cas;
+    try {
+      cas = CasCreationUtils.createCas(typeSystem, typePriorities,
+          new FsIndexDescription[] { indexDescriptor });
+    } catch (ResourceInitializationException e) {
+      e.printStackTrace();
+      cas = null;
+    }
+
+    return cas;
+  }
+
+  /**
+   * Reads an XMI document into the given CAS.
+   *
+   * @param cas the CAS to fill
+   * @param xmiIn the stream to read the XMI document from, it is not closed
+   *     here
+   *
+   * @throws IOException if reading fails or the input is not valid XMI
+   * @throws IllegalStateException if the SAX parser cannot be created
+   */
+  static void deserializeXmiCAS(CAS cas, InputStream xmiIn) throws IOException {
+
+    // NOTE(review): apart from disabling validation the parser runs with
+    // the factory defaults; if XMI files from untrusted sources are read
+    // it should be checked whether DTDs and external entities must be
+    // disabled.
+    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
+    saxParserFactory.setValidating(false);
+
+    SAXParser saxParser;
+
+    try {
+      saxParser = saxParserFactory.newSAXParser();
+    } catch (ParserConfigurationException e) {
+      throw new IllegalStateException(
+          "SAXParser should be configured correctly!", e);
+    } catch (SAXException e) {
+      throw new IllegalStateException("SAX error while creating parser!", e);
+    }
+
+    XmiCasDeserializer deserializer = new XmiCasDeserializer(
+        cas.getTypeSystem());
+
+    try {
+      saxParser.parse(xmiIn, deserializer.getXmiCasHandler(cas));
+    } catch (SAXException e) {
+      throw new IOException("Invalid XMI input!", e);
+    }
+  }
+
+  /**
+   * Writes the given CAS as XMI document to the stream.
+   *
+   * @param cas the CAS to write
+   * @param out the stream to write to, it is not closed here
+   *
+   * @throws IOException if writing or serializing fails. A SAX error is
+   *     reported as IOException too, like deserializeXmiCAS does, since the
+   *     output is incomplete and the caller must be able to notice that.
+   */
+  static void serializeCASToXmi(CAS cas, OutputStream out) throws IOException {
+    XmiCasSerializer xmiSerializer = new XmiCasSerializer(cas.getTypeSystem());
+
+    XMLSerializer xmlSerializer = new XMLSerializer(out, true);
+
+    try {
+      xmiSerializer.serialize(cas, xmlSerializer.getContentHandler());
+    } catch (SAXException e) {
+      throw new IOException("Failed to serialize the CAS to XMI!", e);
+    }
+  }
+}
Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java Wed Jul 6 22:10:42 2011
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import info.bliki.wiki.dump.IArticleFilter;
+import info.bliki.wiki.dump.Siteinfo;
+import info.bliki.wiki.dump.WikiArticle;
+import info.bliki.wiki.dump.WikiXMLParser;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.xml.sax.SAXException;
+
+/**
+ * Command line tool which reads a Wikinews XML dump and writes every
+ * published article of the main namespace as XMI serialized CAS to the
+ * <i>articles</i> folder below the working directory.
+ *
+ * The type system is read from <i>samples/TypeSystem.xml</i>.
+ */
+public class WikinewsConverter {
+
+  /**
+   * Converts each published article into a CAS and writes it to an XMI file.
+   *
+   * The CAS text is the article title, two newlines and the plain text of
+   * the article body. The title is annotated as Headline, the body with
+   * Paragraph, SubHeadline and WikiLink annotations. An additional view
+   * named "WikiMarkup" is created for the page.
+   */
+  static class CASArticleFilter implements IArticleFilter {
+
+    private final TypeSystemDescription tsDesc;
+
+    /** Markup after which the article body is considered to be over. */
+    private final List<String> endOfArticleMarkers = new ArrayList<String>();
+
+    CASArticleFilter(TypeSystemDescription tsDesc) {
+
+      this.tsDesc = tsDesc;
+
+      endOfArticleMarkers.add("{{haveyoursay}}");
+      endOfArticleMarkers.add("== Sources ==");
+      endOfArticleMarkers.add("==Sources==");
+      endOfArticleMarkers.add("== Source ==");
+      endOfArticleMarkers.add("==Source==");
+      endOfArticleMarkers.add("==References==");
+      endOfArticleMarkers.add("== References ==");
+      endOfArticleMarkers.add("=== References===");
+    }
+
+    /**
+     * Turns an article title into a string which is used as file name.
+     *
+     * @param title the article title
+     * @return the title with spaces replaced by underscores, URL encoded
+     *     as UTF-8
+     */
+    public static String titleToUri(String title) {
+      try {
+        return URLEncoder.encode(title.replace(' ', '_'), "UTF-8");
+      } catch (UnsupportedEncodingException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    /**
+     * Converts the page if it is a main namespace article whose text
+     * contains "{publish}" (case insensitive), all other pages are skipped.
+     */
+    public void process(WikiArticle page, Siteinfo siteinfo)
+        throws SAXException {
+
+      if (page.getIntegerNamespace() != 0 || !page.isMain()) {
+        return;
+      }
+
+      String markup = page.getText();
+
+      // A fixed locale is used, otherwise the match depends on the default
+      // locale, e.g. "PUBLISH" is not lower cased to "publish" for Turkish
+      if (markup == null
+          || !markup.toLowerCase(Locale.ENGLISH).contains("{publish}")) {
+        return;
+      }
+
+      String pageText = cutAtEndOfArticle(markup);
+
+      WikinewsWikiModel wikiModel = new WikinewsWikiModel("http://en.wikinews.org/wiki/${image}",
+          "http://en.wikinews.org/wiki/${title}");
+
+      AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
+      String plainStr = wikiModel.render(converter, pageText);
+
+      CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc);
+
+      // TODO: find a way to nicely add title ..
+      StringBuilder articleText = new StringBuilder();
+      articleText.append(page.getTitle());
+
+      int endOffsetTitle = articleText.length();
+
+      articleText.append("\n\n");
+
+      // The annotations of the converter are relative to the body text,
+      // this offset must be added to them
+      int bodyOffset = articleText.length();
+
+      articleText.append(plainStr);
+
+      articleCAS.setDocumentLanguage("en");
+      articleCAS.setDocumentText(articleText.toString());
+
+      Type headlineType = articleCAS.getTypeSystem()
+          .getType("org.apache.opennlp.annotations.Headline");
+      articleCAS.addFsToIndexes(
+          articleCAS.createAnnotation(headlineType, 0, endOffsetTitle));
+
+      Type paragraphType = articleCAS.getTypeSystem()
+          .getType("org.apache.opennlp.annotations.Paragraph");
+      addSpans(articleCAS, paragraphType,
+          converter.getParagraphAnnotations(), bodyOffset);
+
+      Type subHeadlineType = articleCAS.getTypeSystem()
+          .getType("org.apache.opennlp.annotations.SubHeadline");
+      addSpans(articleCAS, subHeadlineType,
+          converter.getHeaderAnnotations(), bodyOffset);
+
+      Type wikiLinkType = articleCAS.getTypeSystem()
+          .getType("org.apache.opennlp.annotations.WikiLink");
+      Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");
+
+      for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
+        AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(wikiLinkType,
+            bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);
+
+        wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value);
+
+        articleCAS.addFsToIndexes(wikiLinkAnnFS);
+      }
+
+      // NOTE(review): the view text is page.toString() and not
+      // page.getText(), confirm that this is intended
+      CAS markupCas = articleCAS.createView("WikiMarkup");
+      markupCas.setDocumentText(page.toString());
+
+      writeArticle(articleCAS, page.getTitle());
+    }
+
+    /**
+     * Cuts the text at the end of article marker which occurs first in the
+     * text, the text is returned unchanged if it contains no marker.
+     */
+    private String cutAtEndOfArticle(String pageText) {
+      int cutIndex = pageText.length();
+
+      // All markers must be checked to find the earliest one; stopping at
+      // the first marker of the list which is contained would keep
+      // everything in front of it, including sections started by markers
+      // further down the list
+      for (String endMarker : endOfArticleMarkers) {
+        int endMarkerIndex = pageText.indexOf(endMarker);
+        if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
+          cutIndex = endMarkerIndex;
+        }
+      }
+
+      return pageText.substring(0, cutIndex);
+    }
+
+    /**
+     * Adds one annotation of the given type per span to the CAS indexes,
+     * the offset is added to the begin and end of each span.
+     */
+    private static void addSpans(CAS cas, Type type, List<Annotation> spans,
+        int offset) {
+      for (Annotation span : spans) {
+        cas.addFsToIndexes(cas.createAnnotation(type,
+            offset + span.begin, offset + span.end));
+      }
+    }
+
+    /**
+     * Writes the CAS to articles/&lt;encoded title&gt;.xmi, on failure the
+     * stack trace is printed and the article is skipped.
+     */
+    private static void writeArticle(CAS articleCAS, String title) {
+      OutputStream casOut = null;
+      try {
+        casOut = new FileOutputStream("articles/" + titleToUri(title) + ".xmi");
+
+        UimaUtil.serializeCASToXmi(articleCAS, casOut);
+      } catch (IOException e) {
+        e.printStackTrace();
+      } finally {
+        if (casOut != null) {
+          try {
+            casOut.close();
+          } catch (IOException ignored) {
+            // nothing can be done if closing fails
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Converts the dump file given as the only argument.
+   *
+   * @param args the path of the XML dump file
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: Parser <XML-FILE>");
+      // TODO: add folder where file are written here
+      System.exit(-1);
+    }
+
+    TypeSystemDescription tsDesc;
+
+    FileInputStream typeSystemIn = new FileInputStream("samples/TypeSystem.xml");
+    try {
+      tsDesc = UimaUtil.createTypeSystemDescription(typeSystemIn);
+    } finally {
+      typeSystemIn.close();
+    }
+
+    // null is returned for an invalid descriptor, without the type system
+    // no article can be converted
+    if (tsDesc == null) {
+      System.err.println("Failed to load the type system from samples/TypeSystem.xml");
+      System.exit(-1);
+    }
+
+    String bz2Filename = args[0];
+    try {
+      IArticleFilter handler = new CASArticleFilter(tsDesc);
+      WikiXMLParser wxp = new WikiXMLParser(bz2Filename, handler);
+      wxp.parse();
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java Wed Jul 6 22:10:42 2011
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import java.util.Map;
+
+import info.bliki.wiki.model.WikiModel;
+
+/**
+ * A wiki model which never reports raw wiki content as missing: content
+ * the super class cannot resolve is replaced by the empty string, except
+ * for the article name "w", which is replaced by its first parameter.
+ */
+public class WikinewsWikiModel extends WikiModel {
+
+  public WikinewsWikiModel(String imageBaseURL, String linkBaseURL) {
+    super(imageBaseURL, linkBaseURL);
+  }
+
+  /**
+   * Returns the content the super class resolves. If that is {@code null}
+   * the value of the parameter "1" is returned for the article name "w"
+   * (presumably the Wikinews template which links to Wikipedia - TODO
+   * confirm), and the empty string in all other cases, also when the
+   * parameter is missing.
+   *
+   * @return the raw wiki content, never {@code null}
+   */
+  @Override
+  public String getRawWikiContent(String namespace, String articleName,
+      Map<String, String> map) {
+
+    String result = super.getRawWikiContent(namespace, articleName, map);
+    if (result != null) {
+      return result;
+    }
+
+    // Maybe use special handling for date, ... ?!
+
+    if ("w".equals(articleName) && map != null) {
+      String firstParameter = map.get("1");
+      if (firstParameter != null) {
+        return firstParameter;
+      }
+    }
+
+    return "";
+  }
+}
Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java
------------------------------------------------------------------------------
svn:mime-type = text/plain