You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/12/20 21:12:49 UTC
svn commit: r1721058 - in /tika/trunk: CHANGES.txt
tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
Author: mattmann
Date: Sun Dec 20 20:12:49 2015
New Revision: 1721058
URL: http://svn.apache.org/viewvc?rev=1721058&view=rev
Log:
Fix for TIKA-1815: Text content from parser is empty when NamedEntityParser is enabled. Contributed by Thamme Gowda <tg...@gmail.com>; this closes #67.
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1721058&r1=1721057&r2=1721058&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Dec 20 20:12:49 2015
@@ -1,5 +1,8 @@
Release 1.12 - Current Development
+ * The NamedEntityParser was enhanced to generate text content
+ in addition to metadata (TIKA-1815).
+
* A significant speed-up is made to the GeoTopicParser by
using the new REST server capabilities from Lucene Geo
Gazetteer (TIKA-1803).
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java?rev=1721058&r1=1721057&r2=1721058&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java Sun Dec 20 20:12:49 2015
@@ -26,6 +26,8 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
@@ -59,7 +61,8 @@ public class NamedEntityParser extends A
public static final Logger LOG = LoggerFactory.getLogger(NamedEntityParser.class);
public static final Set<MediaType> MEDIA_TYPES = new HashSet<>();
public static final String MD_KEY_PREFIX = "NER_";
- public static final String DEFAULT_NER_IMPL = OpenNLPNERecogniser.class.getName();
+ public static final String DEFAULT_NER_IMPL =
+ OpenNLPNERecogniser.class.getName() + "," + RegexNERecogniser.class.getName();
public static final String SYS_PROP_NER_IMPL = "ner.impl.class";
public Tika secondaryParser;
@@ -147,5 +150,27 @@ public class NamedEntityParser extends A
}
}
}
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
+ extractOutput(text.trim(), xhtml);
+ }
+
+ /**
+ * Writes the given content to the XHTML content handler as a
+ * complete document containing a single div element.
+ *
+ * @param content
+ * the text to emit as the character data of the div
+ * @param xhtml
+ * XHTML content handler that receives the SAX events
+ * @throws SAXException
+ * if the XHTML SAX events could not be handled
+ *
+ */
+ private void extractOutput(String content, XHTMLContentHandler xhtml) throws SAXException{
+ xhtml.startDocument();
+ xhtml.startElement("div");
+ xhtml.characters(content);
+ xhtml.endElement("div");
+ xhtml.endDocument();
}
}