You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/12/20 21:12:49 UTC

svn commit: r1721058 - in /tika/trunk: CHANGES.txt tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java

Author: mattmann
Date: Sun Dec 20 20:12:49 2015
New Revision: 1721058

URL: http://svn.apache.org/viewvc?rev=1721058&view=rev
Log:
Fix for TIKA-1815: Text content from parser is empty when NamedEntityParser is enabled. Contributed by Thamme Gowda <tg...@gmail.com>; this closes #67.

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1721058&r1=1721057&r2=1721058&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Dec 20 20:12:49 2015
@@ -1,5 +1,8 @@
 Release 1.12 - Current Development
 
+  * The NamedEntityParser was enhanced to generate text content
+    in addition to metadata (TIKA-1815).
+
   * A significant speed-up is made to the GeoTopicParser by
     using the new REST server capabilities from Lucene Geo
     Gazetteer (TIKA-1803).

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java?rev=1721058&r1=1721057&r2=1721058&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java Sun Dec 20 20:12:49 2015
@@ -26,6 +26,8 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
@@ -59,7 +61,8 @@ public class NamedEntityParser extends A
     public static final Logger LOG = LoggerFactory.getLogger(NamedEntityParser.class);
     public static final Set<MediaType> MEDIA_TYPES = new HashSet<>();
     public static final String MD_KEY_PREFIX = "NER_";
-    public static final String DEFAULT_NER_IMPL = OpenNLPNERecogniser.class.getName();
+    public static final String DEFAULT_NER_IMPL =
+            OpenNLPNERecogniser.class.getName() + "," + RegexNERecogniser.class.getName();
     public static final String SYS_PROP_NER_IMPL = "ner.impl.class";
 
     public Tika secondaryParser;
@@ -147,5 +150,27 @@ public class NamedEntityParser extends A
                 }
             }
         }
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
+        extractOutput(text.trim(), xhtml);
+    }
+
+    /**
+     * writes the content to the given XHTML
+     * content handler
+     *
+     * @param content
+     *          the content which needs to be written
+     * @param xhtml
+     *          XHTML content handler
+     * @throws SAXException
+     *           if the XHTML SAX events could not be handled
+     *
+     */
+    private void extractOutput(String content, XHTMLContentHandler xhtml) throws SAXException{
+        xhtml.startDocument();
+        xhtml.startElement("div");
+        xhtml.characters(content);
+        xhtml.endElement("div");
+        xhtml.endDocument();
     }
 }