You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/07/07 00:10:43 UTC
svn commit: r1143582 - in /incubator/opennlp/sandbox/wikinews-importer: ./ samples/ src/main/java/org/apache/opennlp/wikinews_importer/

Author: joern
Date: Wed Jul  6 22:10:42 2011
New Revision: 1143582

URL: http://svn.apache.org/viewvc?rev=1143582&view=rev
Log:
OPENNLP-211 First version of wikinews importer, based on code contributed by Olivier Grisel. Thanks.

Added:
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java   (with props)
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java   (with props)
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java   (with props)
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java   (with props)
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java   (with props)
Removed:
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/CreateCorpus.java
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/FileUtil.java
    incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsImporter.java
Modified:
    incubator/opennlp/sandbox/wikinews-importer/pom.xml
    incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml

Modified: incubator/opennlp/sandbox/wikinews-importer/pom.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/pom.xml?rev=1143582&r1=1143581&r2=1143582&view=diff
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/pom.xml (original)
+++ incubator/opennlp/sandbox/wikinews-importer/pom.xml Wed Jul  6 22:10:42 2011
@@ -32,7 +32,7 @@
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>wikinews-importer</artifactId>
 	<version>0.0.1-incubating-SNAPSHOT</version>
-	<packaging>war</packaging>
+	<packaging>jar</packaging>
 
 	<name>OpenNLP Wikinews Importer</name>
 
@@ -42,16 +42,21 @@
 
 	<repositories>
 		<repository>
-		    <id>maven2-repository.java.net</id>
-		    <name>Java.net Repository for Maven</name>
-		    <url>http://download.java.net/maven/2/</url>
-		    <layout>default</layout>
-		</repository> 
+			<id>maven2-repository.java.net</id>
+			<name>Java.net Repository for Maven</name>
+			<url>http://download.java.net/maven/2/</url>
+			<layout>default</layout>
+		</repository>
+
 		<repository>
-		    <id>maven-repository.java.net</id>
-		    <name>Java.net Maven 1 Repository (legacy)</name>
-		    <url>http://download.java.net/maven/1</url>
-		    <layout>legacy</layout>
+			<id>info-bliki-repository</id>
+			<url>http://gwtwiki.googlecode.com/svn/maven-repository/</url>
+			<releases>
+				<enabled>true</enabled>
+			</releases>
+			<snapshots>
+				<enabled>false</enabled>
+			</snapshots>
 		</repository>
 	</repositories>
 	
@@ -68,6 +73,19 @@
 		    <version>1.8</version>
 		</dependency>
 
+	    <dependency> 
+	      <groupId>info.bliki.wiki</groupId> 
+	      <artifactId>bliki-core</artifactId> 
+	      <version>3.0.16</version> 
+	    </dependency>
+    
+    	<dependency>
+			<groupId>org.apache.uima</groupId>
+			<artifactId>uimaj-core</artifactId>
+			<version>2.3.1</version>
+			<scope>compile</scope>
+		</dependency>
+		
 		<dependency>
 			<groupId>junit</groupId>
 			<artifactId>junit</artifactId>

Modified: incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml?rev=1143582&r1=1143581&r2=1143582&view=diff
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml (original)
+++ incubator/opennlp/sandbox/wikinews-importer/samples/TypeSystem.xml Wed Jul  6 22:10:42 2011
@@ -32,34 +32,53 @@
 			<description></description>
 			<supertypeName>uima.tcas.Annotation</supertypeName>
 		</typeDescription>
-		
+
+		<typeDescription>
+			<name>org.apache.opennlp.annotations.SubHeadline</name>
+			<description></description>
+			<supertypeName>uima.tcas.Annotation</supertypeName>
+		</typeDescription>
+
 		<typeDescription>
 			<name>org.apache.opennlp.annotations.Paragraph</name>
 			<description></description>
 			<supertypeName>uima.tcas.Annotation</supertypeName>
 		</typeDescription>
-		
+
 		<typeDescription>
 			<name>org.apache.opennlp.annotations.Sentence</name>
 			<description></description>
 			<supertypeName>uima.tcas.Annotation</supertypeName>
 		</typeDescription>
-		
+
 		<typeDescription>
 			<name>org.apache.opennlp.annotations.Token</name>
 			<description></description>
 			<supertypeName>uima.tcas.Annotation</supertypeName>
 		</typeDescription>
-		
+
 		<typeDescription>
 			<name>org.apache.opennlp.annotations.Person</name>
 			<description></description>
 			<supertypeName>uima.tcas.Annotation</supertypeName>
 		</typeDescription>
+
 		<typeDescription>
 			<name>org.apache.opennlp.annotations.Organization</name>
 			<description></description>
 			<supertypeName>uima.tcas.Annotation</supertypeName>
 		</typeDescription>
+
+		<typeDescription>
+			<name>org.apache.opennlp.annotations.WikiLink</name>
+			<supertypeName>uima.tcas.Annotation</supertypeName>
+			<features>
+				<featureDescription>
+					<name>link</name>
+					<description></description>
+					<rangeTypeName>uima.cas.String</rangeTypeName>
+				</featureDescription>
+			</features>
+		</typeDescription>
 	</types>
 </typeSystemDescription>
\ No newline at end of file

Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java Wed Jul  6 22:10:42 2011
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import info.bliki.htmlcleaner.ContentToken;
+import info.bliki.htmlcleaner.TagNode;
+import info.bliki.wiki.filter.ITextConverter;
+import info.bliki.wiki.filter.WPList;
+import info.bliki.wiki.filter.WPTable;
+import info.bliki.wiki.model.Configuration;
+import info.bliki.wiki.model.IWikiModel;
+import info.bliki.wiki.model.ImageFormat;
+import info.bliki.wiki.model.WikiModel;
+import info.bliki.wiki.tags.WPATag;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Parse mediawiki markup to strip the formatting info and extract a simple text
+ * version suitable for NLP along with header, paragraph and link position
+ * annotations.
+ * 
+ * Pass an instance to {@code WikiModel#render(ITextConverter, String)} and
+ * read the collected annotations afterwards, for example with
+ * {@code #getWikiLinkAnnotations()}.
+ * 
+ * Due to the constraints imposed by the {@code ITextConverter} /
+ * {@code WikiModel} API, this class is not thread safe: an instance must
+ * not be shared between threads.
+ */
+public class AnnotatingMarkupParser implements ITextConverter {
+
+    public static final String HREF_ATTR_KEY = "href";
+
+    public static final String WIKILINK_TITLE_ATTR_KEY = "title";
+
+    public static final String WIKILINK_TARGET_ATTR_KEY = "href";
+
+    public static final String WIKIOBJECT_ATTR_KEY = "wikiobject";
+
+    public static final Set<String> PARAGRAPH_TAGS = new HashSet<String>(
+            Arrays.asList("p"));
+
+    public static final Set<String> HEADING_TAGS = new HashSet<String>(
+            Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
+
+    public static final Pattern INTERWIKI_PATTERN = Pattern.compile("http://[\\w-]+\\.wikipedia\\.org/wiki/.*");
+
+    protected final List<Annotation> wikilinks = new ArrayList<Annotation>();
+
+    protected final List<Annotation> headers = new ArrayList<Annotation>();
+
+    protected final List<Annotation> paragraphs = new ArrayList<Annotation>();
+
+    protected String languageCode = "en";
+
+    protected final WikiModel model;
+
+    protected String redirect;
+
+    protected String text;
+
+    protected static final Pattern REDIRECT_PATTERN = Pattern.compile("^#REDIRECT \\[\\[([^\\]]*)\\]\\]");
+
+    /**
+     * Creates a parser which uses the default language code of this class.
+     */
+    public AnnotatingMarkupParser() {
+        this.model = makeWikiModel(this.languageCode);
+    }
+
+    /**
+     * Creates a parser for the given language.
+     *
+     * @param languageCode the language code handed to
+     *        {@link #makeWikiModel(String)}
+     */
+    public AnnotatingMarkupParser(String languageCode) {
+        this.languageCode = languageCode;
+        this.model = makeWikiModel(languageCode);
+    }
+
+    public WikiModel makeWikiModel(String languageCode) {
+        return new WikiModel(String.format(
+                "http:/%s.wikipedia.org/wiki/${image}", languageCode),
+                String.format("http://%s.wikipedia.org/wiki/${title}",
+                        languageCode)) {
+            @Override
+            public String getRawWikiContent(String namespace,
+                    String articleName, Map<String, String> templateParameters) {
+                // disable template support
+                // TODO: we need to readd template support at least for dates
+                return "";
+            }
+        };
+    }
+
+
+    /**
+     * Appends the plain text of the given nodes to the buffer and records
+     * wiki link, paragraph and heading annotations with the character
+     * offsets of the text written through the counting buffer.
+     *
+     * Lists and tables are skipped and the content of {@code ref} tags is
+     * dropped. The recursion level of the model is incremented for the
+     * duration of the call; if the configured limit is exceeded an error
+     * message is appended instead of the node text.
+     *
+     * @param nodes the nodes to render, nothing happens for {@code null}
+     *        or an empty list
+     * @param buffer the target buffer; it is wrapped into a
+     *        {@link CountingAppendable} unless it already is one
+     * @param model the model which does the recursion level bookkeeping
+     * @throws IOException if writing to the buffer fails
+     */
+    public void nodesToText(List<? extends Object> nodes, Appendable buffer,
+            IWikiModel model) throws IOException {
+        if (nodes == null || nodes.isEmpty()) {
+            return;
+        }
+
+        CountingAppendable countingBuffer = buffer instanceof CountingAppendable
+                ? (CountingAppendable) buffer
+                : new CountingAppendable(buffer);
+
+        try {
+            int level = model.incrementRecursionLevel();
+            if (level > Configuration.RENDERER_RECURSION_LIMIT) {
+                countingBuffer.append("Error - recursion limit exceeded"
+                        + " rendering tags in AnnotatingMarkupParser#nodesToText().");
+                return;
+            }
+            for (Object node : nodes) {
+                if (node instanceof WPATag) {
+                    appendWikiLink((WPATag) node, countingBuffer);
+                } else if (node instanceof ContentToken) {
+                    countingBuffer.append(((ContentToken) node).getContent());
+                } else if (node instanceof List || node instanceof WPList
+                        || node instanceof WPTable) {
+                    // ignore lists and tables since they most of the time
+                    // do not hold grammatically correct
+                    // interesting sentences that are representative of the
+                    // language.
+                } else if (node instanceof TagNode) {
+                    appendTag((TagNode) node, countingBuffer, model);
+                }
+            }
+        } finally {
+            // always paired with the increment above, also on early return
+            model.decrementRecursionLevel();
+        }
+    }
+
+    /**
+     * Appends the body text of a wiki link tag and records a wiki link
+     * annotation for it.
+     */
+    private void appendWikiLink(WPATag tag, CountingAppendable countingBuffer)
+            throws IOException {
+        String wikilinkLabel = (String) tag.getAttributes().get(
+                WIKILINK_TITLE_ATTR_KEY);
+        String wikilinkTarget = (String) tag.getAttributes().get(
+                WIKILINK_TARGET_ATTR_KEY);
+
+        int start = countingBuffer.currentPosition;
+        tag.getBodyString(countingBuffer);
+        int end = countingBuffer.currentPosition;
+
+        // The text is always written, but an annotation is only recorded
+        // for links which have a title and a target. Targets starting with
+        // '#' are left out as before; a missing target no longer causes a
+        // NullPointerException.
+        // TODO: wikilink label is not important,since that is the covered text?
+        if (wikilinkLabel != null && wikilinkTarget != null
+                && !wikilinkTarget.startsWith("#")) {
+            wikilinks.add(new Annotation(start, end, wikilinkLabel,
+                    wikilinkTarget));
+        }
+    }
+
+    /**
+     * Appends the text of a generic tag node and records the paragraph,
+     * heading or link annotation which belongs to its tag name.
+     */
+    private void appendTag(TagNode tagNode, CountingAppendable countingBuffer,
+            IWikiModel model) throws IOException {
+        String tagName = tagNode.getName();
+        Map<String, Object> oAttributes = tagNode.getObjectAttributes();
+        int tagBegin = countingBuffer.currentPosition;
+
+        if ("ref".equals(tagName)) {
+            // ignore the references since they do not hold
+            // interesting text content
+        } else if (oAttributes != null
+                && oAttributes.get(WIKIOBJECT_ATTR_KEY) instanceof ImageFormat) {
+            // the caption of images often holds well formed
+            // sentences with links to entities
+            ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY);
+            imageNodeToText(tagNode, iformat, countingBuffer, model);
+        } else {
+            nodesToText(tagNode.getChildren(), countingBuffer, model);
+        }
+
+        if (PARAGRAPH_TAGS.contains(tagName)) {
+            paragraphs.add(new Annotation(tagBegin,
+                    countingBuffer.currentPosition, "paragraph", tagName));
+            countingBuffer.append("\n\n");
+        } else if (HEADING_TAGS.contains(tagName)) {
+            headers.add(new Annotation(tagBegin,
+                    countingBuffer.currentPosition, "heading", tagName));
+            countingBuffer.append("\n\n");
+        } else if ("a".equals(tagName)) {
+            Map<String, String> attributes = tagNode.getAttributes();
+            String href = attributes != null
+                    ? attributes.get(HREF_ATTR_KEY) : null;
+
+            // TODO: How to get covered text here? Is not needed anyway right?!
+            wikilinks.add(new Annotation(tagBegin,
+                    countingBuffer.currentPosition, "", href));
+        }
+    }
+
+    /**
+     * Hook which is called by {@code nodesToText} for tag nodes that carry
+     * an {@code ImageFormat} object attribute. It currently writes nothing,
+     * so image nodes contribute neither text nor annotations.
+     *
+     * @param tagNode the image tag node
+     * @param imageFormat the image format attached to the node
+     * @param buffer the target buffer
+     * @param model the wiki model in use
+     * @throws IOException declared for implementations which write to the
+     *         buffer
+     */
+    public void imageNodeToText(TagNode tagNode, ImageFormat imageFormat,
+            Appendable buffer, IWikiModel model) throws IOException {
+        // rendering of the image caption is disabled:
+//        nodesToText(tagNode.getChildren(), buffer, model);
+    }
+
+    /**
+     * NOTE(review): presumably part of the {@code ITextConverter} contract
+     * and tells the wiki model that this converter does not render links -
+     * confirm against the bliki documentation.
+     *
+     * @return always {@code true}
+     */
+    public boolean noLinks() {
+        return true;
+    }
+
+    /**
+     * @return the live internal list of link annotations recorded by
+     *         {@code nodesToText}, not a copy
+     */
+    public List<Annotation> getWikiLinkAnnotations() {
+        return wikilinks;
+    }
+
+    /**
+     * @return the live internal list of heading annotations recorded by
+     *         {@code nodesToText}, not a copy
+     */
+    public List<Annotation> getHeaderAnnotations() {
+        return headers;
+    }
+
+    /**
+     * @return the live internal list of paragraph annotations recorded by
+     *         {@code nodesToText}, not a copy
+     */
+    public List<Annotation> getParagraphAnnotations() {
+        return paragraphs;
+    }
+
+    /**
+     * Cuts the text covered by every paragraph annotation out of
+     * {@code text}.
+     *
+     * NOTE(review): nothing visible in this class assigns {@code text}; if
+     * it is still {@code null} and paragraphs were recorded this method
+     * fails with a NullPointerException - confirm who sets the field.
+     *
+     * @return the covered texts in annotation order
+     */
+    public List<String> getParagraphs() {
+        final List<String> coveredTexts = new ArrayList<String>(paragraphs.size());
+        for (int i = 0; i < paragraphs.size(); i++) {
+            final Annotation paragraph = paragraphs.get(i);
+            coveredTexts.add(text.substring(paragraph.begin, paragraph.end));
+        }
+        return coveredTexts;
+    }
+
+    /**
+     * Cuts the text covered by every heading annotation out of
+     * {@code text}.
+     *
+     * NOTE(review): nothing visible in this class assigns {@code text}; if
+     * it is still {@code null} and headings were recorded this method
+     * fails with a NullPointerException - confirm who sets the field.
+     *
+     * @return the covered texts in annotation order
+     */
+    public List<String> getHeaders() {
+        final List<String> coveredTexts = new ArrayList<String>(headers.size());
+        for (int i = 0; i < headers.size(); i++) {
+            final Annotation heading = headers.get(i);
+            coveredTexts.add(text.substring(heading.begin, heading.end));
+        }
+        return coveredTexts;
+    }
+
+    /**
+     * @return the value of the {@code redirect} field; NOTE(review): no
+     *         code visible in this class assigns it, so it may always be
+     *         {@code null} - confirm
+     */
+    public String getRedirect() {
+        return redirect;
+    }
+
+    /**
+     * Appendable decorator which forwards all characters to the wrapped
+     * buffer and keeps track of how many characters were written through
+     * it.
+     *
+     * Every append method returns this wrapper (not the wrapped buffer), so
+     * chained calls such as {@code append(a).append(b)} are counted too.
+     */
+    public class CountingAppendable implements Appendable {
+
+        /** Number of characters appended through this wrapper so far. */
+        public int currentPosition = 0;
+
+        final protected Appendable wrappedBuffer;
+
+        /**
+         * @param wrappedBuffer the buffer which receives the characters
+         */
+        public CountingAppendable(Appendable wrappedBuffer) {
+            this.wrappedBuffer = wrappedBuffer;
+        }
+
+        public Appendable append(CharSequence charSeq) throws IOException {
+            // the Appendable contract writes the four characters "null"
+            // for a null sequence, count them accordingly
+            CharSequence toAppend = charSeq == null ? "null" : charSeq;
+            wrappedBuffer.append(toAppend);
+            currentPosition += toAppend.length();
+            return this;
+        }
+
+        public Appendable append(char aChar) throws IOException {
+            wrappedBuffer.append(aChar);
+            currentPosition += 1;
+            return this;
+        }
+
+        public Appendable append(CharSequence charSeq, int start, int end)
+                throws IOException {
+            // count only after the wrapped buffer accepted the range
+            wrappedBuffer.append(charSeq, start, end);
+            currentPosition += end - start;
+            return this;
+        }
+
+    }
+
+}

Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java Wed Jul  6 22:10:42 2011
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+/**
+ * Immutable text span with a label and a value, as recorded by the markup
+ * parser for links, paragraphs and headings.
+ */
+public class Annotation {
+
+    /** Offset of the first covered character. */
+    public final int begin;
+
+    /** Offset directly behind the last covered character. */
+    public final int end;
+
+    /** Kind or title of the span, e.g. "paragraph", "heading" or a link title. */
+    public final String label;
+    
+    /** Payload of the span, e.g. the tag name or the link target. */
+    public final String value;
+
+    /**
+     * @param start offset of the first covered character
+     * @param end offset directly behind the last covered character
+     * @param label kind or title of the span
+     * @param value payload of the span
+     */
+    public Annotation(int start, int end, String label, String value) {
+        this.label = label;
+        this.value = value;
+        this.begin = start;
+        this.end = end;
+    }
+
+}

Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java Wed Jul  6 22:10:42 2011
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.uima.ResourceSpecifierFactory;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.FsIndexDescription;
+import org.apache.uima.resource.metadata.TypePriorities;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.resource.metadata.impl.FsIndexDescription_impl;
+import org.apache.uima.util.CasCreationUtils;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XMLParser;
+import org.apache.uima.util.XMLSerializer;
+import org.xml.sax.SAXException;
+
+public class UimaUtil {
+
+  /**
+   * Parses a type system descriptor from the given stream and resolves its
+   * imports.
+   *
+   * @param in the stream with the XML descriptor
+   * @return the parsed description, or {@code null} if the XML is invalid
+   *         (the stack trace is printed in that case)
+   */
+  static TypeSystemDescription createTypeSystemDescription(InputStream in) {
+
+    // Note:
+    // Type System location is not set correctly,
+    // resolving a referenced type system will fail
+    XMLInputSource descriptorSource = new XMLInputSource(in, new File(""));
+    XMLParser parser = UIMAFramework.getXMLParser();
+
+    try {
+      TypeSystemDescription description =
+          (TypeSystemDescription) parser.parse(descriptorSource);
+      description.resolveImports();
+      return description;
+    } catch (InvalidXMLException e) {
+      e.printStackTrace();
+      return null;
+    }
+  }
+
+  /**
+   * Creates a new CAS for the given type system with empty type priorities
+   * and one sorted index over {@code uima.cas.TOP}.
+   *
+   * @param typeSystem the type system of the new CAS
+   * @return the CAS, or {@code null} if its creation failed (the stack
+   *         trace is printed in that case)
+   */
+  static CAS createEmptyCAS(TypeSystemDescription typeSystem) {
+    TypePriorities priorities = UIMAFramework.getResourceSpecifierFactory()
+        .createTypePriorities();
+
+    FsIndexDescription topIndex = new FsIndexDescription_impl();
+    topIndex.setLabel("TOPIndex");
+    topIndex.setTypeName("uima.cas.TOP");
+    topIndex.setKind(FsIndexDescription.KIND_SORTED);
+
+    try {
+      return CasCreationUtils.createCas(typeSystem, priorities,
+          new FsIndexDescription[] { topIndex });
+    } catch (ResourceInitializationException e) {
+      e.printStackTrace();
+      return null;
+    }
+  }
+
+  static void deserializeXmiCAS(CAS cas, InputStream xmiIn) throws IOException {
+
+    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
+    saxParserFactory.setValidating(false);
+
+    SAXParser saxParser;
+
+    try {
+      saxParser = saxParserFactory.newSAXParser();
+    } catch (ParserConfigurationException e) {
+      throw new IllegalStateException(
+          "SAXParser should be configured correctly!", e);
+    } catch (SAXException e) {
+      throw new IllegalStateException("SAX error while creating parser!", e);
+    }
+
+    XmiCasDeserializer dezerializer = new XmiCasDeserializer(
+        cas.getTypeSystem());
+
+    try {
+      saxParser.parse(xmiIn, dezerializer.getXmiCasHandler(cas));
+    } catch (SAXException e) {
+      throw new IOException("Invalid XMI input!", e);
+    }
+  }
+  
+  /**
+   * Writes the given CAS as formatted XMI to the stream. The stream is not
+   * closed by this method.
+   *
+   * @param cas the CAS to serialize
+   * @param out the stream which receives the XMI document
+   * @throws IOException if the serialization fails; a SAX error is wrapped
+   *         instead of being swallowed, so callers notice incomplete output
+   */
+  static void serializeCASToXmi(CAS cas, OutputStream out) throws IOException {
+    XmiCasSerializer xmiSerializer = new XmiCasSerializer(cas.getTypeSystem());
+
+    XMLSerializer xmlSerializer = new XMLSerializer(out, true);
+
+    try {
+      xmiSerializer.serialize(cas, xmlSerializer.getContentHandler());
+    } catch (SAXException e) {
+      // same reporting style as deserializeXmiCAS: wrap and keep the cause
+      throw new IOException("Failed to serialize CAS to XMI!", e);
+    }
+  }
+}

Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java Wed Jul  6 22:10:42 2011
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+import info.bliki.wiki.dump.IArticleFilter;
+import info.bliki.wiki.dump.Siteinfo;
+import info.bliki.wiki.dump.WikiArticle;
+import info.bliki.wiki.dump.WikiXMLParser;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.xml.sax.SAXException;
+
+/**
+ * Application which reads a compressed or uncompressed Wikinews XML dump
+ * file (depending on the given file extension <i>.gz</i>, <i>.bz2</i> or
+ * <i>.xml</i>) and writes every published article as an XMI serialized CAS
+ * to the <i>articles</i> folder.
+ * 
+ */
+public class WikinewsConverter {
+
+  /**
+   * Converts the published articles of the dump into CASes and writes them as XMI files.
+   * 
+   */
+  static class CASArticleFilter implements IArticleFilter {
+
+    private final TypeSystemDescription tsDesc;
+    private List<String> endOfArtilceMarkers = new ArrayList<String>();
+    
+    CASArticleFilter(TypeSystemDescription tsDesc) {
+      
+      this.tsDesc = tsDesc;
+      
+      endOfArtilceMarkers.add("{{haveyoursay}}");
+      endOfArtilceMarkers.add("== Sources ==");
+      endOfArtilceMarkers.add("==Sources==");
+      endOfArtilceMarkers.add("== Source ==");
+      endOfArtilceMarkers.add("==Source==");
+      endOfArtilceMarkers.add("==References==");
+      endOfArtilceMarkers.add("== References ==");
+      endOfArtilceMarkers.add("=== References===");
+    }
+    
+    
+      /**
+       * Turns an article title into a URL encoded name: every blank is
+       * replaced by an underscore and the result is URL encoded as UTF-8.
+       *
+       * @param title the article title
+       * @return the encoded title
+       */
+      public static String titleToUri(String title) {
+        final String underscored = title.replace(' ', '_');
+        try {
+          return URLEncoder.encode(underscored, "UTF-8");
+        } catch (UnsupportedEncodingException e) {
+          throw new RuntimeException(e);
+        }
+      }
+    
+    public void process(WikiArticle page, Siteinfo siteinfo)
+        throws SAXException {
+      
+      if (page.getIntegerNamespace() == 0 && page.isMain()) {
+
+        if (page.getText().toLowerCase().contains("{publish}")) {
+          
+          String pageText = page.getText();
+          
+          int cutIndex = -1;
+          
+          for (String endMarker : endOfArtilceMarkers) {
+            
+            int endMarkerIndex = pageText.indexOf(endMarker);
+            if (endMarkerIndex != -1) {
+              cutIndex = endMarkerIndex;
+              break;
+            }
+          }
+          
+          if (cutIndex == -1)
+            cutIndex = pageText.length();
+          
+          pageText = pageText.substring(0, cutIndex);
+          
+          WikinewsWikiModel wikiModel = new WikinewsWikiModel("http://en.wikinews.org/wiki/${image}", 
+              "http://en.wikinews.org/wiki/${title}");
+          
+          AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
+          String plainStr = wikiModel.render(converter, pageText);
+          
+          CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc);
+          
+          // TODO: find a way to nicely add title ..
+          StringBuilder articleText = new StringBuilder();
+          articleText.append(page.getTitle());
+          
+          int endOffsetTitle = articleText.length();
+          
+          articleText.append("\n");
+          articleText.append("\n");
+          
+          int bodyOffset = articleText.length();
+          
+          articleText.append(plainStr); // Note: Add offset to annotations ... by this
+          
+          articleCAS.setDocumentLanguage("en");
+          articleCAS.setDocumentText(articleText.toString());
+          
+          AnnotationFS headlineAnnotation = articleCAS.createAnnotation(articleCAS.getTypeSystem()
+              .getType("org.apache.opennlp.annotations.Headline"),
+              0, endOffsetTitle);
+          
+          articleCAS.addFsToIndexes(headlineAnnotation);
+          
+          for (Annotation paraAnn : converter.getParagraphAnnotations()) {
+            AnnotationFS paraAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem()
+                .getType("org.apache.opennlp.annotations.Paragraph"),
+                bodyOffset + paraAnn.begin, bodyOffset + paraAnn.end);
+            
+            articleCAS.addFsToIndexes(paraAnnFS);
+          }
+          
+          for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
+            AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(
+                articleCAS.getTypeSystem()
+                .getType("org.apache.opennlp.annotations.SubHeadline"),
+                bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);
+            
+            articleCAS.addFsToIndexes(subHeadAnnFS);
+          }
+          
+          Type wikiLinkType = articleCAS.getTypeSystem()
+              .getType("org.apache.opennlp.annotations.WikiLink");
+          Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");
+          
+          for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
+            AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(
+                articleCAS.getTypeSystem()
+                .getType("org.apache.opennlp.annotations.WikiLink"),
+                bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);
+            
+            wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value);
+            
+            articleCAS.addFsToIndexes(wikiLinkAnnFS);
+          }
+          
+          CAS markupCas = articleCAS.createView("WikiMarkup");
+          markupCas.setDocumentText(page.toString());
+          
+          // now serialize CAS
+          OutputStream casOut = null;
+          try {
+              casOut = new FileOutputStream("articles/" + titleToUri(page.getTitle()) + ".xmi");
+              
+              UimaUtil.serializeCASToXmi(articleCAS, casOut);
+          }
+          catch (IOException e) {
+            e.printStackTrace();
+          }
+          finally {
+            try {
+            if (casOut != null)
+                casOut.close();
+              } catch (IOException e) {
+              }
+          }
+          
+        }
+      }
+    }
+  }
+
+  /**
+   * Converts the articles of a dump file. The type system is read from
+   * {@code samples/TypeSystem.xml} relative to the working directory.
+   *
+   * @param args exactly one argument, the name of the XML dump file
+   * @throws Exception if the type system file cannot be opened or closed
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: Parser <XML-FILE>"); 
+      // TODO: add folder where file are written here
+      System.exit(-1);
+    }
+    
+    // close the descriptor stream as soon as the type system is parsed
+    TypeSystemDescription tsDesc;
+    FileInputStream typeSystemIn = new FileInputStream("samples/TypeSystem.xml");
+    try {
+      tsDesc = UimaUtil.createTypeSystemDescription(typeSystemIn);
+    } finally {
+      typeSystemIn.close();
+    }
+
+    // createTypeSystemDescription reports invalid XML with a null result,
+    // without a type system no article can be converted
+    if (tsDesc == null) {
+      System.err.println("Could not load type system from samples/TypeSystem.xml");
+      System.exit(-1);
+    }
+
+    String bz2Filename = args[0];
+    try {
+      IArticleFilter handler = new CASArticleFilter(tsDesc);
+      WikiXMLParser wxp = new WikiXMLParser(bz2Filename, handler);
+      wxp.parse();
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}

Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java?rev=1143582&view=auto
==============================================================================
--- incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java (added)
+++ incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java Wed Jul  6 22:10:42 2011
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.wikinews_importer;
+
+import java.util.Map;
+
+import info.bliki.wiki.model.WikiModel;
+
+/**
+ * Wiki model which falls back to a simple template handling when the super
+ * class has no raw content: the template named {@code w} is replaced by
+ * its first parameter, everything else by the empty string.
+ */
+public class WikinewsWikiModel extends WikiModel {
+
+  /**
+   * @param imageBaseURL base URL for images, handed to the super class
+   * @param linkBaseURL base URL for links, handed to the super class
+   */
+  public WikinewsWikiModel(String imageBaseURL, String linkBaseURL) {
+    super(imageBaseURL, linkBaseURL);
+  }
+
+  /**
+   * Returns the content provided by the super class if there is any.
+   * Otherwise the first parameter is returned for the {@code w} template
+   * and the empty string in every other case, so this method never
+   * returns {@code null}.
+   */
+  @Override
+  public String getRawWikiContent(String namespace, String articleName,
+      Map<String, String> map) {
+
+    String result = super.getRawWikiContent(namespace, articleName, map);
+    if (result != null) {
+      return result;
+    }
+
+    // Maybe use special handling for date, ... ?!
+
+    // null safe: neither a missing article name nor a missing parameter
+    // map or first parameter may end in an exception or a null result
+    if ("w".equals(articleName) && map != null) {
+      String firstParameter = map.get("1");
+      if (firstParameter != null) {
+        return firstParameter;
+      }
+    }
+
+    return "";
+  }
+}

Propchange: incubator/opennlp/sandbox/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain