You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/03/29 16:18:00 UTC
svn commit: r1306876 - in /opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/formats/frenchtreebank/
test/java/opennlp/tools/formats/frenchtreebank/
Author: joern
Date: Thu Mar 29 14:17:59 2012
New Revision: 1306876
URL: http://svn.apache.org/viewvc?rev=1306876&view=rev
Log:
OPENNLP-342 First implementation of French Treebank parser.
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java (with props)
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java (with props)
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java?rev=1306876&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java Thu Mar 29 14:17:59 2012
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Stack;
+
+import opennlp.tools.parser.AbstractBottomUpParser;
+import opennlp.tools.parser.Constituent;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.Span;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler which converts the constituent XML of the French Treebank
+ * into {@link Parse} objects.
+ * <p>
+ * While a <code>SENT</code> element is being read, the trimmed text of every
+ * <code>w</code> element is appended to a space separated sentence text and
+ * a {@link Constituent} is recorded for every element which covers at least
+ * one token. When the <code>SENT</code> element is closed, the recorded
+ * constituents are inserted into a new {@link Parse} which is then added to
+ * the list passed to the constructor.
+ */
+class ConstitDocumentHandler extends DefaultHandler {
+
+  private static final String SENT_ELEMENT_NAME = "SENT";
+  private static final String WORD_ELEMENT_NAME = "w";
+  private static final String COMPOUND_ATTR_NAME = "compound";
+
+  /** Label of the constituent which is created for a SENT element. */
+  private static final String SENT_TYPE_NAME = "S";
+
+  /** Receives one parse for every sentence element which was closed. */
+  private final List<Parse> parses;
+
+  /** True between the start of a SENT element and its end. */
+  private boolean insideSentenceElement;
+
+  /**
+   * A token buffer, a token might be built up by multiple
+   * {@link #characters(char[], int, int)} calls.
+   */
+  private final StringBuilder tokenBuffer = new StringBuilder();
+
+  /** Text of the current sentence, every token is followed by one space. */
+  private final StringBuilder text = new StringBuilder();
+
+  /** Character offset in {@link #text} at which the next token will start. */
+  private int offset;
+
+  /** Constituents of the elements which are opened, but not yet closed. */
+  private final Stack<Constituent> stack = new Stack<Constituent>();
+
+  /** Finished constituents of the current sentence, in creation order. */
+  private final List<Constituent> cons = new LinkedList<Constituent>();
+
+  /**
+   * The cat attribute of the most recent word element which was marked as
+   * compound, used to label words which only have a catint attribute.
+   */
+  private String compoundCat;
+
+  /**
+   * Initializes the handler.
+   *
+   * @param parses the list to which the parse of every sentence is added
+   */
+  ConstitDocumentHandler(List<Parse> parses) {
+    this.parses = parses;
+  }
+
+  /**
+   * Pushes an unfinished constituent for the opened element and resets the
+   * token buffer. A SENT element additionally resets all sentence state.
+   *
+   * @param uri unused
+   * @param localName unused
+   * @param qName the element name, also the default constituent label
+   * @param attributes the attributes of the element; compound, cat, subcat
+   *     and catint are read for word elements
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName,
+      Attributes attributes) throws SAXException {
+
+    String type = qName;
+
+    if (SENT_ELEMENT_NAME.equals(qName)) {
+      // Clear everything to be ready for the next sentence
+      text.setLength(0);
+      offset = 0;
+      stack.clear();
+      cons.clear();
+
+      type = SENT_TYPE_NAME;
+
+      insideSentenceElement = true;
+    }
+    else if (WORD_ELEMENT_NAME.equals(qName)) {
+
+      // The attribute VALUE has to be tested (not the attribute name),
+      // equals returns false if the attribute is missing.
+      boolean isCompoundWord =
+          "yes".equals(attributes.getValue(COMPOUND_ATTR_NAME));
+
+      String cat = attributes.getValue("cat");
+
+      if (isCompoundWord) {
+        // Remembered for the nested words which only carry a catint
+        compoundCat = cat;
+      }
+
+      if (cat != null) {
+        String subcat = attributes.getValue("subcat");
+        type = cat + (subcat != null ? subcat : "");
+      }
+      else {
+        String catint = attributes.getValue("catint");
+        type = compoundCat + (catint != null ? catint : "");
+      }
+    }
+
+    // The end of the span is not known yet, it is fixed in endElement
+    stack.push(new Constituent(type, new Span(offset, offset)));
+
+    tokenBuffer.setLength(0);
+  }
+
+  /**
+   * Appends the received characters to the token buffer.
+   */
+  @Override
+  public void characters(char[] ch, int start, int length) throws SAXException {
+    tokenBuffer.append(ch, start, length);
+  }
+
+  /**
+   * Finishes the constituent of the closed element. A word element with
+   * non-empty text contributes a token to the sentence text, and a closed
+   * SENT element results in a new {@link Parse}. Elements outside of a
+   * sentence are ignored.
+   *
+   * @param uri unused
+   * @param localName unused
+   * @param qName the name of the closed element
+   */
+  @Override
+  public void endElement(String uri, String localName, String qName)
+      throws SAXException {
+
+    if (insideSentenceElement) {
+
+      boolean isCreateConstituent = true;
+
+      if (WORD_ELEMENT_NAME.equals(qName)) {
+        String token = tokenBuffer.toString().trim();
+
+        if (token.length() > 0) {
+          cons.add(new Constituent(AbstractBottomUpParser.TOK_NODE,
+              new Span(offset, offset + token.length())));
+
+          text.append(token).append(" ");
+          offset += token.length() + 1;
+        }
+        else {
+          // A word element without own text does not get a constituent
+          isCreateConstituent = false;
+        }
+      }
+
+      Constituent unfinishedCon = stack.pop();
+
+      if (isCreateConstituent) {
+        int start = unfinishedCon.getSpan().getStart();
+        if (start < offset) {
+          // offset - 1 excludes the space appended after the last token
+          cons.add(new Constituent(unfinishedCon.getLabel(),
+              new Span(start, offset - 1)));
+        }
+      }
+
+      if (SENT_ELEMENT_NAME.equals(qName)) {
+        // Finished parsing sentence, now put everything together and create
+        // a Parse object
+
+        String txt = text.toString();
+        int tokenIndex = -1;
+        Parse p = new Parse(txt, new Span(0, txt.length()),
+            AbstractBottomUpParser.TOP_NODE, 1, 0);
+
+        // Iterate instead of using get(index), cons is a linked list
+        for (Constituent con : cons) {
+          String type = con.getLabel();
+          if (!AbstractBottomUpParser.TOP_NODE.equals(type)) {
+            if (AbstractBottomUpParser.TOK_NODE.equals(type)) {
+              tokenIndex++;
+            }
+            Parse c = new Parse(txt, con.getSpan(), type, 1, tokenIndex);
+            p.insert(c);
+          }
+        }
+        parses.add(p);
+
+        insideSentenceElement = false;
+      }
+    }
+
+    // The buffered characters belong to the element which was just closed.
+    // Without this reset an enclosing word element would emit the text of
+    // its last nested word a second time when it is closed.
+    tokenBuffer.setLength(0);
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java?rev=1306876&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java Thu Mar 29 14:17:59 2012
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.SAXException;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Stream filter which parses French Treebank constituent XML documents,
+ * each provided as a byte array, and returns the contained sentences one
+ * {@link Parse} at a time.
+ */
+public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse> {
+
+  private final SAXParser saxParser;
+
+  /** Parses of the current document which were not returned yet. */
+  private final List<Parse> parses = new ArrayList<Parse>();
+
+  /**
+   * Initializes the stream and creates the SAX parser used by it.
+   *
+   * @param samples stream of XML documents, one byte array per document
+   *
+   * @throws IllegalStateException if the SAX parser cannot be created
+   */
+  protected ConstitParseSampleStream(ObjectStream<byte[]> samples) {
+    super(samples);
+
+    SAXParserFactory factory = SAXParserFactory.newInstance();
+    try {
+      saxParser = factory.newSAXParser();
+    } catch (ParserConfigurationException e) {
+      throw new IllegalStateException(e);
+    } catch (SAXException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /**
+   * Returns the next parse, a new document is read from the underlying
+   * stream when all parses of the previous one were returned.
+   *
+   * @return the next parse or null if the underlying stream is exhausted
+   *
+   * @throws IOException if reading fails or a document is not valid XML
+   */
+  public Parse read() throws IOException {
+
+    // A document which does not contain a sentence must not be mistaken
+    // for the end of the stream, keep reading until a parse is available.
+    while (parses.isEmpty()) {
+      byte[] xmlbytes = samples.read();
+
+      if (xmlbytes == null) {
+        return null;
+      }
+
+      // Collected separately, so the parses of a document which fails
+      // half way through are not returned.
+      List<Parse> producedParses = new ArrayList<Parse>();
+      try {
+        saxParser.parse(new ByteArrayInputStream(xmlbytes),
+            new ConstitDocumentHandler(producedParses));
+      } catch (SAXException e) {
+        throw new IOException(e);
+      }
+
+      parses.addAll(producedParses);
+    }
+
+    return parses.remove(0);
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java?rev=1306876&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java Thu Mar 29 14:17:59 2012
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for the {@link ConstitParseSampleStream} class, based on the
+ * sample1.xml resource.
+ */
+public class ConstitParseSampleStreamTest {
+
+  /** The tokens which are expected for the sentence in sample1.xml. */
+  private static final String[] SAMPLE1_TOKENS = new String[]{
+      "L'", "autonomie", "de", "la", "Bundesbank", ",",
+      "la", "politique", "de", "stabilité", "qu'", "elle",
+      "a", "fait", "prévaloir", "(", "avec", "moins",
+      "de", "succès", "et", "de", "sévérité", "qu'",
+      "on", "ne", "le", "dit", ",", "mais",
+      "tout", "est", "relatif", ")", ",", "est",
+      "une", "pièce", "essentielle", "de", "la", "division",
+      "des", "pouvoirs", "en", "Allemagne", "."
+  };
+
+  /**
+   * Reads sample1.xml into a byte array.
+   *
+   * @return byte array containing sample1.xml.
+   *
+   * @throws IOException if the resource is missing or cannot be read
+   */
+  static byte[] getSample1() throws IOException {
+    InputStream sampleIn =
+        ConstitParseSampleStreamTest.class.getResourceAsStream("sample1.xml");
+
+    // Fail with a clear message instead of a NullPointerException
+    if (sampleIn == null) {
+      throw new IOException("Test resource sample1.xml was not found!");
+    }
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+
+    try {
+      byte[] buffer = new byte[1024];
+      int length;
+      // read returns -1 when the end of the stream is reached
+      while ((length = sampleIn.read(buffer)) != -1) {
+        out.write(buffer, 0, length);
+      }
+    } finally {
+      sampleIn.close();
+    }
+
+    return out.toByteArray();
+  }
+
+  /**
+   * The sample contains one sentence, after it was read the stream must
+   * keep returning null.
+   */
+  @Test
+  public void testThereIsExactlyOneSent() throws IOException {
+    ObjectStream<Parse> samples =
+        new ConstitParseSampleStream(ObjectStreamUtils.createObjectStream(getSample1()));
+
+    Assert.assertNotNull(samples.read());
+    Assert.assertNull(samples.read());
+    Assert.assertNull(samples.read());
+  }
+
+  /**
+   * The tag nodes of the parse must match the expected tokens.
+   */
+  @Test
+  public void testTokensAreCorrect() throws IOException {
+
+    ObjectStream<Parse> samples =
+        new ConstitParseSampleStream(ObjectStreamUtils.createObjectStream(getSample1()));
+
+    Parse p = samples.read();
+    Assert.assertNotNull(p);
+
+    Parse[] tagNodes = p.getTagNodes();
+    String[] tokens = new String[tagNodes.length];
+    for (int ti = 0; ti < tagNodes.length; ti++) {
+      tokens[ti] = tagNodes[ti].toString();
+    }
+
+    Assert.assertArrayEquals(SAMPLE1_TOKENS, tokens);
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain