You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/11/17 18:29:26 UTC
svn commit: r1714835 - in /tika/trunk: ./ tika-parsers/
tika-parsers/src/main/java/org/apache/tika/parser/ner/
tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/
tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/
tika-parsers/sr...
Author: mattmann
Date: Tue Nov 17 17:29:25 2015
New Revision: 1714835
URL: http://svn.apache.org/viewvc?rev=1714835&view=rev
Log:
Fix for TIKA-1787: Include Stanford Name Entity Recognition in Tika contributed by Thamme Gowda N and Yueheng He this closes #61 this closes #62
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh (with props)
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml
Modified:
tika/trunk/.gitignore
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/pom.xml
Modified: tika/trunk/.gitignore
URL: http://svn.apache.org/viewvc/tika/trunk/.gitignore?rev=1714835&r1=1714834&r2=1714835&view=diff
==============================================================================
--- tika/trunk/.gitignore (original)
+++ tika/trunk/.gitignore Tue Nov 17 17:29:25 2015
@@ -7,5 +7,6 @@ target
*.iml
*.ipr
*.iws
+*.bin
nbactions.xml
nb-configuration.xml
\ No newline at end of file
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1714835&r1=1714834&r2=1714835&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Nov 17 17:29:25 2015
@@ -1,5 +1,9 @@
Release 1.12 - Current Development
+ * Provide NamedEntityParser which exposes Named Entity Recognition
+ from OpenNLP and Stanford NER providers (TIKA-1787, GitHub-61,
+ GitHub-62).
+
* Allow XHTMLContentHandler to pass attributes of html element
via Markus Jelsma (TIKA-1782).
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1714835&r1=1714834&r2=1714835&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Nov 17 17:29:25 2015
@@ -450,4 +450,43 @@
<system>Jenkins</system>
<url>https://builds.apache.org/job/Tika-trunk/</url>
</ciManagement>
+
+ <profiles>
+ <!-- testSetup: executes the ModelGetter.groovy script in the test-compile phase. -->
+ <!-- NOTE(review): the script name suggests it fetches the OpenNLP NER models used by the tests; its content is not visible here - confirm. -->
+ <profile>
+ <id>testSetup</id>
+ <activation>
+ <!-- auto activate: the profile is switched on only while the file named below is missing -->
+ <file>
+ <missing>${basedir}/src/test/resources/org/apache/tika/parser/ner/opennlp/ner-person.bin</missing>
+ </file>
+ </activation>
+ <!-- NOTE(review): declared as a profile-level project dependency (no scope given), not as a plugin dependency - confirm this is intended. -->
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-model</artifactId>
+ <version>3.3.3</version>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <!-- NOTE(review): no <version> is given for this plugin; presumably it is managed by a parent POM - confirm. -->
+ <plugin>
+ <groupId>org.codehaus.groovy.maven</groupId>
+ <artifactId>gmaven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>testSetup</id>
+ <!-- bound to test-compile, i.e. the script runs before the tests are executed -->
+ <phase>test-compile</phase>
+ <goals>
+ <goal>execute</goal>
+ </goals>
+ <configuration>
+ <source>${basedir}/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy</source>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
</project>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner;
+
+import java.lang.String;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Defines the contract for a named entity recogniser. The NER contract consists of
+ * {@link #isAvailable()}, {@link #getEntityTypes()} and {@link #recognise(String)}.
+ * <p>The constants declared here are the names of the common entity types;
+ * the recognisers in this package use them as the keys of the map returned by
+ * {@link #recognise(String)}.</p>
+ */
+public interface NERecogniser {
+
+ //the common named entity classes
+ String LOCATION = "LOCATION";
+ String PERSON = "PERSON";
+ String ORGANIZATION = "ORGANIZATION";
+ String MISCELLANEOUS = "MISCELLANEOUS";
+ String TIME = "TIME";
+ String DATE = "DATE";
+ String PERCENT = "PERCENT";
+ String MONEY = "MONEY";
+
+ /**
+ * Checks whether this named entity recogniser is available for service.
+ * @return {@code true} if this recogniser is ready to recognise, {@code false} otherwise
+ */
+ boolean isAvailable();
+
+ /**
+ * Gets the set of entity types whose names are recognisable by this recogniser.
+ * @return set of entity types/classes
+ */
+ Set<String> getEntityTypes();
+
+ /**
+ * Runs name recognition on the given text.
+ * @param text text which possibly contains names
+ * @return map of entityType -&gt; set of names
+ */
+ Map<String, Set<String>> recognise(String text);
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * This implementation of {@link org.apache.tika.parser.Parser} extracts
+ * entity names from text content and adds them to the metadata.
+ * <p>All the metadata keys have the common prefix {@value #MD_KEY_PREFIX}.</p>
+ * <p>The Named Entity recogniser implementation can be changed by setting the
+ * system property {@value #SYS_PROP_NER_IMPL} to the name of a class (or a
+ * comma separated list of class names) implementing the {@link NERecogniser}
+ * contract. When the property is not set, {@link OpenNLPNERecogniser} is used.</p>
+ * <p>Recognisers are instantiated lazily, on the first call to
+ * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.</p>
+ *
+ * @see OpenNLPNERecogniser
+ * @see NERecogniser
+ */
+public class NamedEntityParser extends AbstractParser {
+
+    public static final Logger LOG = LoggerFactory.getLogger(NamedEntityParser.class);
+    public static final Set<MediaType> MEDIA_TYPES = new HashSet<>();
+    /** Prefix of every metadata key written by this parser. */
+    public static final String MD_KEY_PREFIX = "NER_";
+    /** Recogniser class used when {@link #SYS_PROP_NER_IMPL} is not set. */
+    public static final String DEFAULT_NER_IMPL = OpenNLPNERecogniser.class.getName();
+    /** System property holding the comma separated recogniser class names. */
+    public static final String SYS_PROP_NER_IMPL = "ner.impl.class";
+
+    /** Used to obtain text from content that is not declared as plain text. */
+    public Tika secondaryParser;
+
+    static {
+        MEDIA_TYPES.add(MediaType.TEXT_PLAIN);
+    }
+
+    private List<NERecogniser> nerChain;
+    private volatile boolean initialized = false;
+    private volatile boolean available = false;
+
+    /**
+     * Loads the configured recognisers and the secondary parser. Runs at most
+     * once; later calls return immediately.
+     *
+     * @param context the parse context (currently unused, see TODO below)
+     */
+    private synchronized void initialize(ParseContext context) {
+        if (initialized) {
+            return;
+        }
+        try {
+            //TODO: read class name from context or config
+            //There can be multiple classes in the form of comma separated class names;
+            String classNamesString = System.getProperty(SYS_PROP_NER_IMPL,
+                    DEFAULT_NER_IMPL);
+            String[] classNames = classNamesString.split(",");
+            List<NERecogniser> chain = new ArrayList<>(classNames.length);
+            for (String className : classNames) {
+                className = className.trim();
+                if (className.isEmpty()) {
+                    //tolerate stray commas in the property value
+                    continue;
+                }
+                LOG.info("going to load, instantiate and bind the instance of {}",
+                        className);
+                try {
+                    NERecogniser recogniser =
+                            (NERecogniser) Class.forName(className).newInstance();
+                    boolean usable = recogniser.isAvailable();
+                    LOG.info("{} is available ? {}", className, usable);
+                    if (usable) {
+                        chain.add(recogniser);
+                    }
+                } catch (Exception e) {
+                    //a broken recogniser must not prevent the others from loading
+                    LOG.error(e.getMessage(), e);
+                }
+            }
+            this.nerChain = chain;
+            try {
+                TikaConfig config = new TikaConfig();
+                this.secondaryParser = new Tika(config);
+                this.available = !chain.isEmpty();
+                LOG.info("Number of NERecognisers in chain {}", chain.size());
+            } catch (Exception e) {
+                LOG.error(e.getMessage(), e);
+                this.available = false;
+            }
+        } finally {
+            //Publish the flag last: parse() reads it without holding the lock, so
+            //setting it any earlier would let a concurrent caller skip
+            //initialisation and see 'available' still false.
+            initialized = true;
+        }
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return MEDIA_TYPES;
+    }
+
+    /**
+     * Reads the whole content as text, runs every recogniser of the chain on it
+     * and adds each recognised name to the metadata under the key
+     * {@value #MD_KEY_PREFIX} + entity type. Nothing is written to the content
+     * handler. Returns without doing anything when no recogniser is available.
+     *
+     * @param inputStream the content to analyse
+     * @param contentHandler not used by this parser
+     * @param metadata source of the content type and sink of the recognised names
+     * @param parseContext the parse context
+     * @throws IOException if the content cannot be read
+     */
+    @Override
+    public void parse(InputStream inputStream, ContentHandler contentHandler,
+                      Metadata metadata, ParseContext parseContext)
+            throws IOException, SAXException, TikaException {
+
+        if (!initialized) {
+            initialize(parseContext);
+        }
+        if (!available) {
+            return;
+        }
+
+        //plain text is decoded directly, anything else goes through the secondary parser
+        Reader reader = MediaType.TEXT_PLAIN.toString()
+                .equals(metadata.get(Metadata.CONTENT_TYPE))
+                ? new InputStreamReader(inputStream, StandardCharsets.UTF_8)
+                : secondaryParser.parse(inputStream);
+
+        String text;
+        try {
+            text = IOUtils.toString(reader);
+        } finally {
+            //close even when reading fails
+            IOUtils.closeQuietly(reader);
+        }
+
+        for (NERecogniser ner : nerChain) {
+            Map<String, Set<String>> names = ner.recognise(text);
+            if (names != null) {
+                for (Map.Entry<String, Set<String>> entry : names.entrySet()) {
+                    if (entry.getValue() != null) {
+                        String mdKey = MD_KEY_PREFIX + entry.getKey();
+                        for (String name : entry.getValue()) {
+                            metadata.add(mdKey, name);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.corenlp;
+
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
+ * due to runtime binding to Stanford CoreNLP: the classifier is looked up and
+ * invoked through reflection only.
+ * See <a href="http://wiki.apache.org/tika/TikaAndNER#CoreNLP">
+ * Tika NER Wiki</a> for configuring this recogniser.
+ * @see NERecogniser
+ *
+ */
+public class CoreNLPNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(CoreNLPNERecogniser.class);
+
+    //default model paths
+    public static final String NER_3CLASS_MODEL = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
+    public static final String NER_4CLASS_MODEL = "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
+    public static final String NER_7CLASS_MODEL = "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
+    /**
+     * default Model path
+     */
+    public static final String DEFAULT_MODEL_PATH = NER_7CLASS_MODEL;
+    /**
+     * name of the system property which overrides {@link #DEFAULT_MODEL_PATH}
+     */
+    public static final String MODEL_PROP_NAME = "ner.corenlp.model";
+
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add(PERSON);
+        add(TIME);
+        add(LOCATION);
+        add(ORGANIZATION);
+        add(MONEY);
+        add(PERCENT);
+        add(DATE);
+    }};
+    private static final String CLASSIFIER_CLASS_NAME = "edu.stanford.nlp.ie.crf.CRFClassifier";
+
+    private boolean available = false;
+    //reflective handles on the public fields of edu.stanford.nlp.util.Triple
+    private Field firstField;
+    private Field secondField;
+    private Field thirdField;
+    private Object classifierInstance;
+    private Method classifyMethod;
+
+    /**
+     * Creates a NERecogniser using the model named by the system property
+     * {@value #MODEL_PROP_NAME}, or {@link #DEFAULT_MODEL_PATH} when it is not set.
+     */
+    public CoreNLPNERecogniser(){
+        this(System.getProperty(MODEL_PROP_NAME, DEFAULT_MODEL_PATH));
+    }
+
+    /**
+     * Creates a NERecogniser by loading model from given path. Any failure
+     * (missing CoreNLP classes, unreadable model, ...) is logged and leaves
+     * this recogniser unavailable; no exception is propagated.
+     * @param modelPath path to NER model file
+     */
+    public CoreNLPNERecogniser(String modelPath) {
+        try {
+            Properties props = new Properties();
+            Class<?> classifierClass = Class.forName(CLASSIFIER_CLASS_NAME);
+            Method loadMethod = classifierClass.getMethod("getClassifier", String.class, Properties.class);
+            //the first argument of invoke() is ignored for a static method
+            classifierInstance = loadMethod.invoke(classifierClass, modelPath, props);
+            classifyMethod = classifierClass.getMethod("classifyToCharacterOffsets", String.class);
+
+            //these fields are for accessing result
+            Class<?> tripleClass = Class.forName("edu.stanford.nlp.util.Triple");
+            this.firstField = tripleClass.getField("first");
+            this.secondField = tripleClass.getField("second");
+            this.thirdField = tripleClass.getField("third");
+            this.available = true;
+        } catch (Exception e) {
+            //Reflective failures arrive wrapped in an InvocationTargetException whose
+            //own message is null, so report the underlying cause when there is one.
+            Throwable cause = e.getCause() != null ? e.getCause() : e;
+            LOG.warn("{} while trying to load the model from {}", cause, modelPath);
+        }
+        LOG.info("Available for service ? {}", available);
+    }
+
+    /**
+     *
+     * @return {@code true} if model was available, valid and was able to initialise the classifier.
+     * returns {@code false} when this recogniser is not available for service.
+     */
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    @Override
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -&gt; set of names; empty when this recogniser
+     * is not available, when {@code text} is {@code null} or when classification fails
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> names = new HashMap<>();
+        if (!available || text == null) {
+            //nothing to classify with, or nothing to classify
+            return names;
+        }
+        try {
+            List<?> entries = (List<?>) classifyMethod.invoke(classifierInstance, text);
+            for (Object entry : entries) {
+                //each entry is a Triple: first = type, second/third = offsets used to slice the text
+                String entityType = (String) firstField.get(entry);
+                Set<String> typeNames = names.get(entityType);
+                if (typeNames == null) {
+                    typeNames = new HashSet<>();
+                    names.put(entityType, typeNames);
+                }
+                int start = (Integer) secondField.get(entry);
+                int end = (Integer) thirdField.get(entry);
+                //Collapse every run of whitespace (spaces, tabs, any line break) into a single space
+                String name = text.substring(start, end).trim().replaceAll("\\s+", " ");
+                if (!name.isEmpty()) {
+                    typeNames.add(name);
+                }
+            }
+        } catch (Exception e) {
+            //best effort: whatever was collected before the failure is still returned
+            LOG.debug(e.getMessage(), e);
+        }
+        return names;
+    }
+
+    /**
+     * Command line entry point: prints, as JSON, the names found in a text file.
+     * @param args a single argument, the path to the text file
+     * @throws IOException if the file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        if (args.length != 1) {
+            System.err.println("Error: Invalid Args");
+            System.err.println("This tool finds names inside text");
+            System.err.println("Usage: <path/to/text/file>");
+            return;
+        }
+
+        try (FileInputStream stream = new FileInputStream(args[0])) {
+            String text = IOUtils.toString(stream);
+            CoreNLPNERecogniser ner = new CoreNLPNERecogniser();
+            Map<String, Set<String>> names = ner.recognise(text);
+            JSONObject jNames = new JSONObject(names);
+            System.out.println(jNames.toString(2));
+        }
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import org.apache.tika.parser.ner.NERecogniser;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+
+/**
+ * A {@link NERecogniser} that delegates to a chain of
+ * {@link OpenNLPNameFinder}s, one for every entity type whose NER model could
+ * be loaded.
+ *
+ * <p>The default constructor tries the following model files, each of them
+ * resolved below the directory {@link #MODELS_DIR}:</p>
+ *
+ * <table>
+ *     <tr>
+ *         <th>Entity Type</th><th>File</th>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value NERecogniser#PERSON}</td><td>{@value #PERSON_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value NERecogniser#LOCATION}</td><td>{@value #LOCATION_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value NERecogniser#ORGANIZATION}</td><td>{@value #ORGANIZATION_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value NERecogniser#TIME}</td><td>{@value #TIME_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value NERecogniser#DATE}</td><td>{@value #DATE_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value NERecogniser#PERCENT}</td><td>{@value #PERCENT_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value NERecogniser#MONEY}</td><td>{@value #MONEY_FILE}</td>
+ *     </tr>
+ * </table>
+ *
+ * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL
+ */
+public class OpenNLPNERecogniser implements NERecogniser {
+
+    /** Resource directory of the default models: this class' package name with '/' separators. */
+    public static final String MODELS_DIR = OpenNLPNERecogniser.class
+            .getPackage().getName().replace(".", "/");
+    public static final String PERSON_FILE = "ner-person.bin";
+    public static final String LOCATION_FILE = "ner-location.bin";
+    public static final String ORGANIZATION_FILE = "ner-organization.bin";
+    public static final String TIME_FILE = "ner-time.bin";
+    public static final String DATE_FILE = "ner-date.bin";
+    public static final String PERCENT_FILE = "ner-percentage.bin";
+    public static final String MONEY_FILE = "ner-money.bin";
+
+
+    //Default (English) Models for the common 7 classes of named types
+    public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE;
+    public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE;
+    public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE;
+    public static final String NER_TIME_MODEL = MODELS_DIR + "/" + TIME_FILE;
+    public static final String NER_DATE_MODEL = MODELS_DIR + "/" + DATE_FILE;
+    public static final String NER_PERCENT_MODEL = MODELS_DIR + "/" + PERCENT_FILE;
+    public static final String NER_MONEY_MODEL = MODELS_DIR + "/" + MONEY_FILE;
+
+    /** Entity type -&gt; model path pairs used by the default constructor. */
+    public static final Map<String, String> DEFAULT_MODELS =
+            new HashMap<String, String>(){{
+                put(PERSON, NER_PERSON_MODEL);
+                put(LOCATION, NER_LOCATION_MODEL);
+                put(ORGANIZATION, NER_ORGANIZATION_MODEL);
+                put(TIME, NER_TIME_MODEL);
+                put(DATE, NER_DATE_MODEL);
+                put(PERCENT, NER_PERCENT_MODEL);
+                put(MONEY, NER_MONEY_MODEL);
+            }};
+
+    private final Set<String> entityTypes;
+    private final List<OpenNLPNameFinder> nameFinders;
+    private final boolean available;
+
+    /**
+     * Builds the chain from {@link #DEFAULT_MODELS}.
+     */
+    public OpenNLPNERecogniser(){
+        this(DEFAULT_MODELS);
+    }
+
+    /**
+     * Builds a chain holding one name finder per usable model.
+     * @param models map of entityType -&gt; model path
+     *               NOTE: the model path should be known to class loader.
+     */
+    public OpenNLPNERecogniser(Map<String, String> models){
+        List<OpenNLPNameFinder> usableFinders = new ArrayList<>();
+        Set<String> usableTypes = new HashSet<>();
+        for (String type : models.keySet()) {
+            OpenNLPNameFinder candidate = new OpenNLPNameFinder(type, models.get(type));
+            if (!candidate.isAvailable()) {
+                //model could not be loaded, leave this type out of the chain
+                continue;
+            }
+            usableFinders.add(candidate);
+            usableTypes.add(type);
+        }
+        this.nameFinders = usableFinders;
+        this.entityTypes = Collections.unmodifiableSet(usableTypes);
+        //at least one finder is present
+        this.available = !usableFinders.isEmpty();
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return entityTypes;
+    }
+
+    /**
+     * Tokenizes the text once and hands the same tokens to every finder of the chain.
+     * @param text text which possibly contains names
+     * @return map of entityType -&gt; set of names, merged over all finders
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        String[] tokens = OpenNLPNameFinder.tokenize(text);
+        Map<String, Set<String>> merged = new HashMap<>();
+        for (OpenNLPNameFinder finder : nameFinders) {
+            Map<String, Set<String>> found = finder.findNames(tokens);
+            merged.putAll(found);
+        }
+        return merged;
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An implementation of {@link NERecogniser} that finds names in text using an OpenNLP model.
+ * This implementation works with only one entity type. For chaining several name finder
+ * instances, see {@link OpenNLPNERecogniser}.
+ * <p>The wrapped {@link NameFinderME} is guarded by this object's monitor in both
+ * {@link #recognise(String)} and {@link #findNames(String[])}.</p>
+ */
+public class OpenNLPNameFinder implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OpenNLPNameFinder.class);
+    private final String nameType;
+    private final Set<String> nameTypes;
+    //stays null when the model could not be loaded
+    private NameFinderME nameFinder;
+    private boolean available;
+
+    /**
+     * Creates OpenNLP name finder. When the model cannot be found or read the
+     * problem is logged and the instance reports itself as not available.
+     * @param nameType the entity type recognised by the given NER model
+     * @param nerModelPath path to ner model, resolved through the class loader
+     */
+    public OpenNLPNameFinder(String nameType, String nerModelPath) {
+        this.nameTypes = Collections.singleton(nameType);
+        this.nameType = nameType;
+        InputStream nerModelStream = getClass().getClassLoader().getResourceAsStream(nerModelPath);
+        try {
+            if (nerModelStream != null){
+                TokenNameFinderModel model = new TokenNameFinderModel(nerModelStream);
+                this.nameFinder = new NameFinderME(model);
+                this.available = true;
+            } else {
+                LOG.warn("Couldn't find model from {} using class loader", nerModelPath);
+            }
+        } catch (IOException e) {
+            LOG.error(e.getMessage(), e);
+        } finally {
+            IOUtils.closeQuietly(nerModelStream);
+        }
+        LOG.info("{} NER : Available for service ? {}", nameType, available);
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return nameTypes;
+    }
+
+    /**
+     * Splits text into tokens on runs of whitespace.
+     * @param text the text to split
+     * @return the tokens; an empty array when the text holds nothing but whitespace
+     */
+    public static String[] tokenize(String text){
+        //NOTE: replace this with a NLP tokenizer tool
+        String trimmed = text.trim();
+        if (trimmed.isEmpty()) {
+            //String.split would otherwise yield a single empty token
+            return new String[0];
+        }
+        return trimmed.split("\\s+");
+    }
+
+    @Override
+    public synchronized Map<String, Set<String>> recognise(String text) {
+        String[] tokens = tokenize(text);
+        return findNames(tokens);
+    }
+
+    /**
+     * finds names from given array of tokens
+     * @param tokens the tokens array
+     * @return map of EntityType -&gt; set of entity names; empty when this finder is
+     * not available or when there are no tokens
+     */
+    public synchronized Map<String, Set<String>> findNames(String[] tokens) {
+        Map<String, Set<String>> result = new HashMap<>();
+        if (!available || tokens == null || tokens.length == 0) {
+            return result;
+        }
+        try {
+            Span[] nameSpans = nameFinder.find(tokens);
+            String[] names = Span.spansToStrings(nameSpans, tokens);
+            if (names != null && names.length > 0) {
+                result.put(nameType, new HashSet<>(Arrays.asList(names)));
+            }
+        } finally {
+            //reset the finder's adaptive data even when find() fails
+            nameFinder.clearAdaptiveData();
+        }
+        return result;
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * Regular Expressions.
+ *<p>
+ * The default configuration file {@value NER_REGEX_FILE} is used when no
+ * argument constructor is used to instantiate this class. The regex file is
+ * loaded via {@link Class#getResourceAsStream(String)}, so the file should be
+ * placed in the same package path as of this class.
+ * </p>
+ * The format of regex configuration as follows:
+ * <pre>
+ * ENTITY_TYPE1=REGEX1
+ * ENTITY_TYPE2=REGEX2
+ * </pre>
+ *
+ * <i>For example, to extract week day from text:</i>
+ * <pre>WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
+ * </pre>
+ * @since Nov. 7, 2015
+ */
+public class RegexNERecogniser implements NERecogniser {
+
+ public static final String NER_REGEX_FILE = "ner-regex.txt";
+ private static Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);
+
+ public Set<String> entityTypes = new HashSet<>();
+ public Map<String, Pattern> patterns;
+ private boolean available = false;
+
+ private static RegexNERecogniser INSTANCE;
+
+ public RegexNERecogniser(){
+ this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
+ }
+
+ public RegexNERecogniser(InputStream stream){
+ try {
+ patterns = new HashMap<>();
+ List<String> lines = IOUtils.readLines(stream, StandardCharsets.UTF_8);
+ IOUtils.closeQuietly(stream);
+ for (String line : lines) {
+ line = line.trim();
+ if (line.isEmpty() || line.startsWith("#")){ //empty or comment
+ continue; //skip
+ }
+
+ int delim = line.indexOf('=');
+ if (delim < 0) { //delim not found
+ //skip
+ LOG.error("Skipped : Invalid config : {} ", line);
+ continue;
+ }
+ String type = line.substring(0, delim).trim();
+ String patternStr = line.substring(delim+1, line.length()).trim();
+ patterns.put(type, Pattern.compile(patternStr));
+ entityTypes.add(type);
+ }
+ } catch (Exception e) {
+ LOG.error(e.getMessage(), e);
+ }
+ available = !entityTypes.isEmpty();
+ }
+
+ public synchronized static RegexNERecogniser getInstance() {
+ if (INSTANCE == null) {
+ INSTANCE = new RegexNERecogniser();
+ }
+ return INSTANCE;
+ }
+
+ @Override
+ public boolean isAvailable() {
+ return available;
+ }
+
+ @Override
+ public Set<String> getEntityTypes() {
+ return entityTypes;
+ }
+
+ /**
+ * finds matching sub groups in text
+ * @param text text containing interesting sub strings
+ * @param pattern pattern to find sub strings
+ * @return set of sub strings if any found, or null if none found
+ */
+ public Set<String> findMatches(String text, Pattern pattern){
+ Set<String> results = null;
+ Matcher matcher = pattern.matcher(text);
+ if (matcher.find()) {
+ results = new HashSet<>();
+ results.add(matcher.group(0));
+ while (matcher.find()) {
+ results.add(matcher.group(0));
+ }
+ }
+ return results;
+ }
+
+ @Override
+ public Map<String, Set<String>> recognise(String text) {
+ Map<String, Set<String>> result = new HashMap<>();
+ for (Map.Entry<String, Pattern> entry : patterns.entrySet()) {
+ Set<String> names = findMatches(text, entry.getValue());
+ if (names != null) {
+ result.put(entry.getKey(), names);
+ }
+ }
+ return result;
+ }
+}
Added: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt (added)
+++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt Tue Nov 17 17:29:25 2015
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Each entry has the following format:
+# type = regex
+# The first occurrence of '=' separates the type from its regex.
+
+# WEEK_DAY=(?i)((sun)|(mon)|(tues)|(wednes)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link NamedEntityParser}
+ */
+public class NamedEntityParserTest {
+
+    public static final String CONFIG_FILE = "tika-config.xml";
+
+    /**
+     * Parses the text with a Tika instance that is configured from
+     * {@link #CONFIG_FILE}.
+     * @param text the text to parse
+     * @return the metadata filled in by the parse
+     */
+    private Metadata parse(String text) throws Exception {
+        //test config is added to resources directory
+        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+        Tika tika = new Tika(config);
+        Metadata md = new Metadata();
+        //explicit charset keeps the test independent of the platform default
+        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.forName("UTF-8"))), md);
+        return md;
+    }
+
+    /**
+     * @return all values of the named metadata field as a set
+     */
+    private static HashSet<String> valuesOf(Metadata md, String name) {
+        return new HashSet<String>(Arrays.asList(md.getValues(name)));
+    }
+
+    @Test
+    public void testParse() throws Exception {
+        String text = "I am student at University of Southern California (USC)," +
+                " located in Los Angeles . USC's football team is called by name Trojans." +
+                " Mr. John McKay was a head coach of the team from 1960 - 1975";
+        Metadata md = parse(text);
+
+        assertTrue(valuesOf(md, "X-Parsed-By").contains(NamedEntityParser.class.getName()));
+        assertTrue(valuesOf(md, "NER_PERSON").contains("John McKay"));
+        assertTrue(valuesOf(md, "NER_LOCATION").contains("Los Angeles"));
+        assertTrue(valuesOf(md, "NER_ORGANIZATION").contains("University of Southern California"));
+        assertTrue(valuesOf(md, "NER_DATE").contains("1960 - 1975"));
+    }
+
+    @Test
+    public void testNerChain() throws Exception {
+        String classNames = OpenNLPNERecogniser.class.getName()
+                + "," + RegexNERecogniser.class.getName();
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
+        try {
+            String text = "University of Southern California (USC), is located in Los Angeles ." +
+                    " Campus is busy from monday to saturday";
+            Metadata md = parse(text);
+            HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
+            assertTrue(keys.contains("NER_WEEK_DAY"));
+            assertTrue(keys.contains("NER_LOCATION"));
+        } finally {
+            //system properties are JVM wide: put the old state back so that
+            //tests which run later are not affected by this one
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+    }
+}
\ No newline at end of file
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link RegexNERecogniser} when it is used through
+ * {@link NamedEntityParser}
+ */
+public class RegexNERecogniserTest {
+
+    @Test
+    public void testGetEntityTypes() throws Exception {
+
+        String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
+        try {
+            Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+            Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
+            assertTrue(days.contains("Sunday"));
+            assertTrue(days.contains("MONDAY"));
+            assertTrue(days.contains("Saturday"));
+            assertTrue("Unexpected matches : " + days, days.size() == 3); //and nothing else
+        } finally {
+            //system properties are JVM wide: put the old state back so that
+            //tests which run later are not affected by this one
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+    }
+}
\ No newline at end of file
Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy Tue Nov 17 17:29:25 2015
@@ -0,0 +1,93 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file downloads Apache OpenNLP NER models for testing the NamedEntityParser
+ */
+
+import org.apache.commons.io.IOUtils
+
+/**
+ * Copies input stream to output stream, additionally printing the progress.
+ * The content is moved in chunks of 4 KB and the progress is printed at most
+ * once per second. Both streams are closed before this method returns, also
+ * when the copy fails.
+ * @param inStr source stream
+ * @param outStr target stream
+ * @param totalLength the total length of the content (used to calculate progress);
+ *        when it is not positive, only the number of copied bytes is printed
+ * @return nothing useful
+ */
+def copyWithProgress(InputStream inStr, OutputStream outStr, long totalLength){
+    int PROGRESS_DELAY = 1000 //milliseconds between two progress reports
+    byte[] buffer = new byte[1024 * 4]
+    long count = 0
+    int len
+    long tt = System.currentTimeMillis()
+    try {
+        //read() reports the end of the stream with -1
+        while ((len = inStr.read(buffer)) != -1) {
+            outStr.write(buffer, 0, len)
+            count += len
+            if (System.currentTimeMillis() - tt > PROGRESS_DELAY) {
+                if (totalLength > 0) {
+                    println "${count * 100.0/totalLength}% : $count bytes of $totalLength"
+                } else {
+                    //length is unknown, a percentage can not be calculated
+                    println "$count bytes"
+                }
+                tt = System.currentTimeMillis()
+            }
+        }
+        //closeQuietly hides errors, so flush while they can still be reported
+        outStr.flush()
+        println "Copy complete. "
+    } finally {
+        IOUtils.closeQuietly(inStr)
+        IOUtils.closeQuietly(outStr)
+    }
+}
+
+/**
+ * Downloads file
+ * <p>
+ * The content is first stored in a sibling file with the suffix ".part" and
+ * moved to the final path only after the whole content has arrived, so that
+ * an interrupted download never leaves an incomplete file at the final path.
+ * @param urlStr url of file
+ * @param file path to store file
+ * @return nothing useful
+ */
+def downloadFile(String urlStr, File file) {
+    println "GET : $urlStr -> $file"
+    def urlConn = new URL(urlStr).openConnection()
+    long contentLength = urlConn.getContentLengthLong()
+
+    //a file without directory part has no parent to create
+    file.getParentFile()?.mkdirs()
+    File partFile = new File(file.getPath() + ".part")
+    def inStream = null
+    def outStream = null
+    boolean complete = false
+    try {
+        inStream = urlConn.getInputStream()
+        outStream = new FileOutputStream(partFile)
+        copyWithProgress(inStream, outStream, contentLength)
+        complete = true
+    } finally {
+        IOUtils.closeQuietly(outStream)
+        IOUtils.closeQuietly(inStream)
+        if (!complete) {
+            //do not keep the incomplete content
+            partFile.delete()
+        }
+    }
+    //replace instead of fail, an existing file was overwritten before as well
+    java.nio.file.Files.move(partFile.toPath(), file.toPath(),
+            java.nio.file.StandardCopyOption.REPLACE_EXISTING)
+    println "Download Complete.."
+}
+
+
+// location of the models on the web, and the directory which receives them
+def modelsBaseUrl = "http://opennlp.sourceforge.net/models-1.5"
+def targetDir = "src/test/resources/org/apache/tika/parser/ner/opennlp/"
+
+// detecting proper path for test resources
+def atParentProject = ["tika-parsers", "tika-app"].every { new File(it).exists() }
+if (atParentProject) {
+    // running from parent maven project, but resources should go to sub-module
+    targetDir = "tika-parsers/" + targetDir
+}
+
+def downloads = [:] //filePath : url
+downloads[targetDir + "ner-person.bin"] = modelsBaseUrl + "/en-ner-person.bin"
+downloads[targetDir + "ner-location.bin"] = modelsBaseUrl + "/en-ner-location.bin"
+downloads[targetDir + "ner-organization.bin"] = modelsBaseUrl + "/en-ner-organization.bin"
+downloads[targetDir + "ner-date.bin"] = modelsBaseUrl + "/en-ner-date.bin"
+
+// only the models which are not there yet are downloaded
+downloads.each { path, url ->
+    File file = new File(path)
+    if (!file.exists()) {
+        downloadFile(url, file)
+    }
+}
\ No newline at end of file
Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh Tue Nov 17 17:29:25 2015
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Getting OpenNLP NER models"
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-person.bin" -O ner-person.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-location.bin" -O ner-location.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-organization.bin" -O ner-organization.bin
+
+# Additional 4
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-date.bin" -O ner-date.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-money.bin" -O ner-money.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-time.bin" -O ner-time.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-percentage.bin" -O ner-percentage.bin
\ No newline at end of file
Propchange: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
------------------------------------------------------------------------------
svn:executable = *
Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt Tue Nov 17 17:29:25 2015
@@ -0,0 +1,17 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Names of the week days, in any letter case; the "day" suffix is optional
+WEEK_DAY=(?i)((sun)|(mon)|(tues)|(wednes)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file
Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml Tue Nov 17 17:29:25 2015
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.ner.NamedEntityParser">
+ <mime>text/plain</mime>
+ <mime>text/html</mime>
+ <mime>application/xhtml+xml</mime>
+ </parser>
+ </parsers>
+
+</properties>
\ No newline at end of file