You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/03/29 16:18:00 UTC

svn commit: r1306876 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/formats/frenchtreebank/ test/java/opennlp/tools/formats/frenchtreebank/

Author: joern
Date: Thu Mar 29 14:17:59 2012
New Revision: 1306876

URL: http://svn.apache.org/viewvc?rev=1306876&view=rev
Log:
OPENNLP-342 First implementation of French Treebank parser.

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java   (with props)

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java?rev=1306876&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java Thu Mar 29 14:17:59 2012
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Stack;
+
+import opennlp.tools.parser.AbstractBottomUpParser;
+import opennlp.tools.parser.Constituent;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.Span;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that turns the constituent annotation of a French Treebank
+ * XML document into {@link Parse} objects.
+ * <p>
+ * For every <code>SENT</code> element one {@link Parse} is appended to the
+ * list handed to the constructor. The sentence text is rebuilt from the
+ * trimmed character content of the <code>w</code> elements, each followed by
+ * a single space, and every element closed inside the sentence which covers
+ * at least one token is inserted into the parse as a constituent.
+ */
+class ConstitDocumentHandler extends DefaultHandler {
+
+  private static final String SENT_ELEMENT_NAME = "SENT";
+  private static final String WORD_ELEMENT_NAME = "w";
+  private static final String COMPOUND_ATTR_NAME = "compound";
+
+  /** Constituent label given to the <code>SENT</code> element. */
+  private static final String SENT_TYPE_NAME = "S";
+
+  /** Output list, one parse is appended per finished sentence. */
+  private final List<Parse> parses;
+
+  /** True between the start and the end of a <code>SENT</code> element. */
+  private boolean insideSentenceElement;
+
+  /**
+   * A token buffer, a token might be built up by multiple
+   * {@link #characters(char[], int, int)} calls.
+   */
+  private final StringBuilder tokenBuffer = new StringBuilder();
+
+  /** Text of the current sentence; every token is followed by one space. */
+  private final StringBuilder text = new StringBuilder();
+
+  /** Current length of {@link #text}, the start offset of the next token. */
+  private int offset;
+
+  /** Constituents of the elements which are opened but not yet closed. */
+  private final Stack<Constituent> stack = new Stack<Constituent>();
+
+  /** Finished constituents of the current sentence, in closing order. */
+  private final List<Constituent> cons = new LinkedList<Constituent>();
+
+  /** The cat attribute of the most recent word marked as compound. */
+  private String compoundCat;
+
+  /**
+   * Initializes the handler.
+   *
+   * @param parses the list the parses of all read sentences are added to
+   */
+  ConstitDocumentHandler(List<Parse> parses) {
+    this.parses = parses;
+  }
+
+  /**
+   * Opens a new constituent which starts at the current text offset.
+   * <p>
+   * The label is the element name, except for <code>SENT</code> which is
+   * labeled <code>S</code> and for <code>w</code> which is labeled with its
+   * cat attribute plus the optional subcat attribute. A word without a cat
+   * attribute is labeled with the cat of the last compound word plus its
+   * optional catint attribute.
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName,
+      Attributes attributes) throws SAXException {
+
+    String type = qName;
+
+    if (SENT_ELEMENT_NAME.equals(qName)) {
+      // Clear everything to be ready for the next sentence
+      text.setLength(0);
+      offset = 0;
+      stack.clear();
+      cons.clear();
+
+      type = SENT_TYPE_NAME;
+
+      insideSentenceElement = true;
+    }
+    else if (WORD_ELEMENT_NAME.equals(qName)) {
+
+      String cat = attributes.getValue("cat");
+
+      // The VALUE of the compound attribute must be tested, comparing against
+      // the attribute name can never match. A missing attribute yields null
+      // which is handled by the equals call.
+      boolean isCompoundWord =
+          "yes".equals(attributes.getValue(COMPOUND_ATTR_NAME));
+
+      if (isCompoundWord) {
+        // Remembered for the words nested inside of the compound word
+        compoundCat = cat;
+      }
+
+      if (cat != null) {
+        String subcat = attributes.getValue("subcat");
+        type = cat + (subcat != null ? subcat : "");
+      }
+      else {
+        String catint = attributes.getValue("catint");
+        type = compoundCat + (catint != null ? catint : "");
+      }
+    }
+
+    // The end of the span is not known yet, it is fixed in endElement
+    stack.push(new Constituent(type, new Span(offset, offset)));
+
+    tokenBuffer.setLength(0);
+  }
+
+  /**
+   * Collects the character data, it is turned into a token when the
+   * enclosing <code>w</code> element is closed.
+   */
+  @Override
+  public void characters(char[] ch, int start, int length) throws SAXException {
+    tokenBuffer.append(ch, start, length);
+  }
+
+  /**
+   * Closes the constituent opened by the matching
+   * {@link #startElement(String, String, String, Attributes)} call.
+   * <p>
+   * A <code>w</code> element with non-blank text contributes a token to the
+   * sentence text. When the <code>SENT</code> element is closed the collected
+   * constituents are assembled into a {@link Parse}. Elements outside of a
+   * sentence are ignored.
+   */
+  @Override
+  public void endElement(String uri, String localName, String qName)
+      throws SAXException {
+
+    if (insideSentenceElement) {
+
+      boolean isCreateConstituent = true;
+
+      if (WORD_ELEMENT_NAME.equals(qName)) {
+        String token = tokenBuffer.toString().trim();
+
+        if (token.length() > 0) {
+          cons.add(new Constituent(AbstractBottomUpParser.TOK_NODE,
+              new Span(offset, offset + token.length())));
+
+          text.append(token).append(" ");
+          offset += token.length() + 1;
+        }
+        else {
+          // A word element without own text does not get a constituent
+          isCreateConstituent = false;
+        }
+      }
+
+      Constituent unfinishedCon = stack.pop();
+
+      if (isCreateConstituent) {
+        int start = unfinishedCon.getSpan().getStart();
+        // Only elements which cover text are added, offset - 1 drops the
+        // space which was appended after the last token
+        if (start < offset) {
+          cons.add(new Constituent(unfinishedCon.getLabel(),
+              new Span(start, offset - 1)));
+        }
+      }
+
+      if (SENT_ELEMENT_NAME.equals(qName)) {
+        parses.add(createParse());
+
+        insideSentenceElement = false;
+      }
+    }
+
+    // The buffered characters belong to the element which was just closed.
+    // Without the reset the end tag of an enclosing word element would read
+    // the text of its last nested word again and emit a duplicate token.
+    tokenBuffer.setLength(0);
+  }
+
+  /**
+   * Puts the text and the constituents of the finished sentence together.
+   *
+   * @return the parse of the current sentence
+   */
+  private Parse createParse() {
+    String txt = text.toString();
+    int tokenIndex = -1;
+    Parse p = new Parse(txt, new Span(0, txt.length()),
+        AbstractBottomUpParser.TOP_NODE, 1, 0);
+
+    // Iterator based loop, indexed access on a linked list is linear
+    for (Constituent con : cons) {
+      String type = con.getLabel();
+      if (!AbstractBottomUpParser.TOP_NODE.equals(type)) {
+        // Labels must be compared by value and not by reference
+        if (AbstractBottomUpParser.TOK_NODE.equals(type)) {
+          tokenIndex++;
+        }
+        p.insert(new Parse(txt, con.getSpan(), type, 1, tokenIndex));
+      }
+    }
+
+    return p;
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitDocumentHandler.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java?rev=1306876&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java Thu Mar 29 14:17:59 2012
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.SAXException;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Stream which parses XML documents in the French Treebank constituent
+ * format and returns the contained sentences as {@link Parse} objects.
+ * <p>
+ * Every <code>byte[]</code> read from the underlying stream is parsed as one
+ * XML document with a {@link ConstitDocumentHandler}. The resulting parses
+ * are queued and handed out one per {@link #read()} call.
+ */
+public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse> {
+
+  private final SAXParser saxParser;
+
+  /** Parses of the last read document which are not returned yet. */
+  private final List<Parse> parses = new ArrayList<Parse>();
+
+  /**
+   * Initializes the stream.
+   *
+   * @param samples stream of XML documents, one document per byte array
+   *
+   * @throws IllegalStateException if the SAX parser cannot be created
+   */
+  protected ConstitParseSampleStream(ObjectStream<byte[]> samples) {
+    super(samples);
+
+    SAXParserFactory factory = SAXParserFactory.newInstance();
+    try {
+      saxParser = factory.newSAXParser();
+    } catch (ParserConfigurationException e) {
+      throw new IllegalStateException(e);
+    } catch (SAXException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /**
+   * Retrieves the next parse.
+   *
+   * @return the next parse or null if the underlying stream is exhausted
+   *
+   * @throws IOException if reading fails or a document is not valid XML
+   */
+  public Parse read() throws IOException {
+
+    // Keep reading until a document produced at least one parse. A document
+    // without sentences must not be mistaken for the end of the stream.
+    while (parses.isEmpty()) {
+      byte[] xmlbytes = samples.read();
+
+      if (xmlbytes == null) {
+        return null;
+      }
+
+      // Collected separately so that a document which fails half way
+      // through does not leave partial results in the queue
+      List<Parse> producedParses = new ArrayList<Parse>();
+      try {
+        saxParser.parse(new ByteArrayInputStream(xmlbytes),
+            new ConstitDocumentHandler(producedParses));
+      } catch (SAXException e) {
+        throw new IOException(e);
+      }
+
+      parses.addAll(producedParses);
+    }
+
+    return parses.remove(0);
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java?rev=1306876&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java Thu Mar 29 14:17:59 2012
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for the {@link ConstitParseSampleStream} class, based on the
+ * sample1.xml test resource.
+ */
+public class ConstitParseSampleStreamTest {
+
+  /** The tokens which are expected to be read from sample1.xml. */
+  private final String[] sample1Tokens = new String[]{
+      "L'",
+      "autonomie",
+      "de",
+      "la",
+      "Bundesbank",
+      ",",
+      "la",
+      "politique",
+      "de",
+      "stabilité",
+      "qu'",
+      "elle",
+      "a",
+      "fait",
+      "prévaloir",
+      "(",
+      "avec",
+      "moins",
+      "de",
+      "succès",
+      "et",
+      "de",
+      "sévérité",
+      "qu'",
+      "on",
+      "ne",
+      "le",
+      "dit",
+      ",",
+      "mais",
+      "tout",
+      "est",
+      "relatif",
+      ")",
+      ",",
+      "est",
+      "une",
+      "pièce",
+      "essentielle",
+      "de",
+      "la",
+      "division",
+      "des",
+      "pouvoirs",
+      "en",
+      "Allemagne",
+      "."
+  };
+
+  /**
+   * Reads sample1.xml into a byte array.
+   *
+   * @return byte array containing sample1.xml.
+   *
+   * @throws IOException if the resource is missing or cannot be read
+   */
+  static byte[] getSample1() throws IOException {
+    InputStream sampleIn =
+        ConstitParseSampleStreamTest.class.getResourceAsStream("sample1.xml");
+
+    // Fail with a clear message instead of a NullPointerException
+    if (sampleIn == null) {
+      throw new IOException("Test resource sample1.xml could not be found!");
+    }
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+
+    byte[] buffer = new byte[1024];
+    int length;
+    try {
+      // read signals the end of the stream with -1
+      while ((length = sampleIn.read(buffer)) != -1) {
+        out.write(buffer, 0, length);
+      }
+    } finally {
+      sampleIn.close();
+    }
+
+    return out.toByteArray();
+  }
+
+  /**
+   * The sample must produce one parse, afterwards the stream must keep
+   * returning null.
+   */
+  @Test
+  public void testThereIsExactlyOneSent() throws IOException {
+    ObjectStream<Parse> samples =
+        new ConstitParseSampleStream(ObjectStreamUtils.createObjectStream(getSample1()));
+
+    Assert.assertNotNull(samples.read());
+    Assert.assertNull(samples.read());
+    Assert.assertNull(samples.read());
+  }
+
+  /**
+   * The string values of the tag nodes of the read parse must match the
+   * expected tokens.
+   */
+  @Test
+  public void testTokensAreCorrect() throws IOException {
+
+    ObjectStream<Parse> samples =
+        new ConstitParseSampleStream(ObjectStreamUtils.createObjectStream(getSample1()));
+
+    Parse p = samples.read();
+
+    // Report a missing parse as assertion failure and not as an error
+    Assert.assertNotNull(p);
+
+    Parse[] tagNodes = p.getTagNodes();
+    String[] tokens = new String[tagNodes.length];
+    for (int ti = 0; ti < tagNodes.length; ti++) {
+      tokens[ti] = tagNodes[ti].toString();
+    }
+
+    Assert.assertArrayEquals(sample1Tokens, tokens);
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain