You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/11/17 18:29:26 UTC

svn commit: r1714835 - in /tika/trunk: ./ tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/ner/ tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/ tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/ tika-parsers/sr...

Author: mattmann
Date: Tue Nov 17 17:29:25 2015
New Revision: 1714835

URL: http://svn.apache.org/viewvc?rev=1714835&view=rev
Log:
Fix for TIKA-1787: Include Stanford Name Entity Recognition in Tika contributed by Thamme Gowda N and Yueheng He this closes #61 this closes #62

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh   (with props)
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
    tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml
Modified:
    tika/trunk/.gitignore
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/pom.xml

Modified: tika/trunk/.gitignore
URL: http://svn.apache.org/viewvc/tika/trunk/.gitignore?rev=1714835&r1=1714834&r2=1714835&view=diff
==============================================================================
--- tika/trunk/.gitignore (original)
+++ tika/trunk/.gitignore Tue Nov 17 17:29:25 2015
@@ -7,5 +7,6 @@ target
 *.iml
 *.ipr
 *.iws
+*.bin
 nbactions.xml
 nb-configuration.xml
\ No newline at end of file

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1714835&r1=1714834&r2=1714835&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Nov 17 17:29:25 2015
@@ -1,5 +1,9 @@
 Release 1.12 - Current Development
 
+  * Provide NamedEntityParser which exposes Named Entity Recognition
+    from OpenNLP and Stanford NER providers (TIKA-1787, GitHub-61,
+    GitHub-62).
+
   * Allow XHTMLContentHandler to pass attributes of html element 
     via Markus Jelsma (TIKA-1782).
 

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1714835&r1=1714834&r2=1714835&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Nov 17 17:29:25 2015
@@ -450,4 +450,43 @@
     <system>Jenkins</system>
     <url>https://builds.apache.org/job/Tika-trunk/</url>
   </ciManagement>
+
+  <profiles>
+    <profile>
+      <id>testSetup</id>
+      <activation>
+        <!-- auto activate -->
+        <file>
+          <missing>${basedir}/src/test/resources/org/apache/tika/parser/ner/opennlp/ner-person.bin</missing>
+        </file>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.maven</groupId>
+          <artifactId>maven-model</artifactId>
+          <version>3.3.3</version>
+        </dependency>
+      </dependencies>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.groovy.maven</groupId>
+            <artifactId>gmaven-plugin</artifactId>
+            <executions>
+              <execution>
+                <id>testSetup</id>
+                <phase>test-compile</phase>
+                <goals>
+                  <goal>execute</goal>
+                </goals>
+                <configuration>
+                  <source>${basedir}/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy</source>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
 </project>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner;
+
+import java.lang.String;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Defines a contract for named entity recogniser. The NER contract includes {@link #isAvailable()},
+ * {@link #getEntityTypes()} and {@link #recognise( String )}
+ */
+public interface NERecogniser {
+
+    //the common named entity classes
+    String LOCATION = "LOCATION";
+    String PERSON = "PERSON";
+    String ORGANIZATION = "ORGANIZATION";
+    String MISCELLANEOUS = "MISCELLANEOUS";
+    String TIME = "TIME";
+    String DATE = "DATE";
+    String PERCENT = "PERCENT";
+    String MONEY = "MONEY";
+
+    /**
+     * checks if this Named Entity recogniser is available for service
+     * @return true if this recogniser is ready to recognise, false otherwise
+     */
+    boolean isAvailable();
+
+    /**
+     * gets the set of entity types whose names are recognisable by this recogniser
+     * @return set of entity types/classes
+     */
+    Set<String> getEntityTypes();
+
+    /**
+     * recognises the names of entities in the given text
+     * @param text text which possibly contains names
+     * @return map of entityType -> set of names
+     */
+    Map<String, Set<String>> recognise(String text);
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ *
+ * This implementation of {@link org.apache.tika.parser.Parser} extracts
+ * entity names from text content and adds it to the metadata.
+ * <p>All the metadata keys will have a common prefix {@value #MD_KEY_PREFIX}</p>
+ * <p>The Named Entity recogniser implementation can be changed by setting the
+ * system property {@value #SYS_PROP_NER_IMPL} value to a name of class that
+ * implements {@link NERecogniser} contract</p>
+ * @see OpenNLPNERecogniser
+ * @see NERecogniser
+ *
+ */
+public class NamedEntityParser extends AbstractParser {
+
+    public static final Logger LOG = LoggerFactory.getLogger(NamedEntityParser.class);
+    public static final Set<MediaType> MEDIA_TYPES = new HashSet<>();
+    public static final String MD_KEY_PREFIX = "NER_";
+    public static final String DEFAULT_NER_IMPL = OpenNLPNERecogniser.class.getName();
+    public static final String SYS_PROP_NER_IMPL = "ner.impl.class";
+
+    public Tika secondaryParser;
+
+    static {
+        MEDIA_TYPES.add(MediaType.TEXT_PLAIN);
+    }
+
+    private List<NERecogniser> nerChain;
+    private volatile boolean initialized = false;
+    private volatile boolean available = false;
+
+    private synchronized void initialize(ParseContext context) {
+        if (initialized) {
+            return;
+        }
+        initialized = true;
+
+        //TODO: read class name from context or config
+        //There can be multiple classes in the form of comma separated class names;
+        String classNamesString = System.getProperty(SYS_PROP_NER_IMPL,
+                DEFAULT_NER_IMPL);
+        String[] classNames = classNamesString.split(",");
+        this.nerChain = new ArrayList<>(classNames.length);
+        for (String className : classNames) {
+            className = className.trim();
+            LOG.info("going to load, instantiate and bind the instance of {}",
+                    className);
+            try {
+                NERecogniser recogniser =
+                        (NERecogniser) Class.forName(className).newInstance();
+                LOG.info("{} is available ? {}", className,
+                        recogniser.isAvailable());
+                if (recogniser.isAvailable()) {
+                    nerChain.add(recogniser);
+                }
+            } catch (Exception e) {
+                LOG.error(e.getMessage(), e);
+            }
+        }
+        try {
+            TikaConfig config = new TikaConfig();
+            this.secondaryParser = new Tika(config);
+            this.available = !nerChain.isEmpty();
+            LOG.info("Number of NERecognisers in chain {}", nerChain.size());
+        } catch (Exception e){
+            LOG.error(e.getMessage(), e);
+            this.available = false;
+        }
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return MEDIA_TYPES;
+    }
+
+    public void parse(InputStream inputStream, ContentHandler contentHandler,
+                      Metadata metadata, ParseContext parseContext)
+            throws IOException, SAXException, TikaException {
+
+        if (!initialized) {
+            initialize(parseContext);
+        }
+        if (!available) {
+            return;
+        }
+
+        Reader reader = MediaType.TEXT_PLAIN.toString()
+                .equals(metadata.get(Metadata.CONTENT_TYPE))
+                ? new InputStreamReader(inputStream, StandardCharsets.UTF_8)
+                : secondaryParser.parse(inputStream);
+
+        String text = IOUtils.toString(reader);
+        IOUtils.closeQuietly(reader);
+
+        for (NERecogniser ner : nerChain) {
+            Map<String, Set<String>> names = ner.recognise(text);
+            if (names != null) {
+                for (Map.Entry<String, Set<String>> entry : names.entrySet()) {
+                    if (entry.getValue() != null) {
+                        String mdKey = MD_KEY_PREFIX + entry.getKey();
+                        for (String name : entry.getValue()) {
+                            metadata.add(mdKey, name);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.corenlp;
+
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+/**
+ *  This class offers an implementation of {@link NERecogniser} based on
+ *  CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
+ *  due to runtime binding to Stanford CoreNLP.
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNER#CoreNLP">
+ *      Tika NER Wiki</a> for configuring this recogniser.
+ *  @see NERecogniser
+ *
+ */
+public class CoreNLPNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(CoreNLPNERecogniser.class);
+
+    //default model paths
+    public static final String NER_3CLASS_MODEL = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
+    public static final String NER_4CLASS_MODEL = "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
+    public static final String NER_7CLASS_MODEL = "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
+    /**
+     * default Model path
+     */
+    public static final String DEFAULT_MODEL_PATH = NER_7CLASS_MODEL;
+    public static final String MODEL_PROP_NAME = "ner.corenlp.model";
+
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add(PERSON);
+        add(TIME);
+        add(LOCATION);
+        add(ORGANIZATION);
+        add(MONEY);
+        add(PERCENT);
+        add(DATE);
+    }};
+    private static final String CLASSIFIER_CLASS_NAME = "edu.stanford.nlp.ie.crf.CRFClassifier";
+
+    private boolean available = false;
+    private Field firstField;
+    private Field secondField;
+    private Field thirdField;
+    private Object classifierInstance;
+    private Method classifyMethod;
+
+    public CoreNLPNERecogniser(){
+        this(System.getProperty(MODEL_PROP_NAME, DEFAULT_MODEL_PATH));
+    }
+
+    /**
+     * Creates a NERecogniser by loading model from given path
+     * @param modelPath path to NER model file
+     */
+    public CoreNLPNERecogniser(String modelPath) {
+        try {
+            Properties props = new Properties();
+            Class<?> classifierClass = Class.forName(CLASSIFIER_CLASS_NAME);
+            Method loadMethod = classifierClass.getMethod("getClassifier", String.class, Properties.class);
+            classifierInstance = loadMethod.invoke(classifierClass, modelPath, props);
+            classifyMethod = classifierClass.getMethod("classifyToCharacterOffsets", String.class);
+
+            //these fields are for accessing result
+            Class<?> tripleClass = Class.forName("edu.stanford.nlp.util.Triple");
+            this.firstField = tripleClass.getField("first");
+            this.secondField = tripleClass.getField("second");
+            this.thirdField = tripleClass.getField("third");
+            this.available = true;
+        } catch (Exception e) {
+            LOG.warn("{} while trying to load the model from {}", e.getMessage(), modelPath);
+        }
+        LOG.info("Available for service ? {}", available);
+    }
+
+    /**
+     *
+     * @return {@code true} if model was available, valid and was able to initialise the classifier.
+     * returns {@code false} when this recogniser is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> names = new HashMap<>();
+        try {
+            Object result = classifyMethod.invoke(classifierInstance, text);
+            List entries = (List) result;
+            for (Object entry : entries) {
+                String entityType = (String) firstField.get(entry);
+                if (!names.containsKey(entityType)) {
+                    names.put(entityType, new HashSet<String>());
+                }
+                Integer start = (Integer) secondField.get(entry);
+                Integer end = (Integer) thirdField.get(entry);
+                String name = text.substring(start, end);
+                //Clean repeating spaces, replace line breaks and tabs with single space
+                name = name.trim().replaceAll("(\\s\\s+)|\n|\t", " ");
+                if (!name.isEmpty()) {
+                    names.get(entityType).add(name);
+                }
+            }
+
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+        return names;
+    }
+
+    public static void main(String[] args) throws IOException {
+        if (args.length != 1) {
+            System.err.println("Error: Invalid Args");
+            System.err.println("This tool finds names inside text");
+            System.err.println("Usage: <path/to/text/file>");
+            return;
+        }
+
+        try (FileInputStream stream = new FileInputStream(args[0])) {
+            String text = IOUtils.toString(stream);
+            CoreNLPNERecogniser ner = new CoreNLPNERecogniser();
+            Map<String, Set<String>> names = ner.recognise(text);
+            JSONObject jNames = new JSONObject(names);
+            System.out.println(jNames.toString(2));
+        }
+    }
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import org.apache.tika.parser.ner.NERecogniser;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+
+/**
+ *
+ * This implementation of {@link NERecogniser} chains an array of
+ * {@link OpenNLPNameFinder}s for which NER models are
+ * available in classpath.
+ *
+ * The following models are scanned during initialization via the class loader:
+ *
+ * <table>
+ *     <tr>
+ *         <th>Entity Type</th><th>Path</th>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value PERSON}</td><td> {@value PERSON_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value LOCATION}</td><td>{@value LOCATION_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value ORGANIZATION}</td><td>{@value ORGANIZATION_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value TIME}</td><td>{@value TIME_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value DATE}</td><td>{@value DATE_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value PERCENT}</td><td>{@value PERCENT_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value MONEY}</td><td>{@value MONEY_FILE}</td>
+ *     </tr>
+ * </table>
+ *
+ * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL
+ */
+public class OpenNLPNERecogniser implements NERecogniser {
+
+    public static final String MODELS_DIR = OpenNLPNERecogniser.class
+            .getPackage().getName().replace(".", "/");
+    public static final String PERSON_FILE = "ner-person.bin";
+    public static final String LOCATION_FILE = "ner-location.bin";
+    public static final String ORGANIZATION_FILE = "ner-organization.bin";
+    public static final String TIME_FILE = "ner-time.bin";
+    public static final String DATE_FILE = "ner-date.bin";
+    public static final String PERCENT_FILE = "ner-percentage.bin";
+    public static final String MONEY_FILE = "ner-money.bin";
+
+
+    //Default (English) Models for the common 7 classes of named types
+    public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE;
+    public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE;
+    public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE;
+    public static final String NER_TIME_MODEL = MODELS_DIR + "/" + TIME_FILE;
+    public static final String NER_DATE_MODEL = MODELS_DIR + "/" + DATE_FILE;
+    public static final String NER_PERCENT_MODEL = MODELS_DIR + "/" + PERCENT_FILE;
+    public static final String NER_MONEY_MODEL = MODELS_DIR + "/" + MONEY_FILE;
+
+    public static final Map<String, String> DEFAULT_MODELS =
+            new HashMap<String, String>(){{
+                put(PERSON, NER_PERSON_MODEL);
+                put(LOCATION, NER_LOCATION_MODEL);
+                put(ORGANIZATION, NER_ORGANIZATION_MODEL);
+                put(TIME, NER_TIME_MODEL);
+                put(DATE, NER_DATE_MODEL);
+                put(PERCENT, NER_PERCENT_MODEL);
+                put(MONEY, NER_MONEY_MODEL);
+            }};
+
+    private Set<String> entityTypes;
+    private List<OpenNLPNameFinder> nameFinders;
+    private boolean available;
+
+    /**
+     * Creates a default chain of Name finders using default OpenNLP recognizers
+     */
+    public OpenNLPNERecogniser(){
+        this(DEFAULT_MODELS);
+    }
+
+    /**
+     * Creates a chain of Named Entity recognisers
+     * @param models map of entityType -> model path
+     * NOTE: the model path should be known to class loader.
+     */
+    public OpenNLPNERecogniser(Map<String, String> models){
+        this.nameFinders = new ArrayList<>();
+        this.entityTypes = new HashSet<>();
+        for (Map.Entry<String, String> entry : models.entrySet()) {
+            OpenNLPNameFinder finder =
+                    new OpenNLPNameFinder(entry.getKey(), entry.getValue());
+            if (finder.isAvailable()) {
+                this.nameFinders.add(finder);
+                this.entityTypes.add(entry.getKey());
+            }
+        }
+        this.entityTypes = Collections.unmodifiableSet(this.entityTypes);
+        this.available = nameFinders.size() > 0; //at least one finder is present
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return entityTypes;
+    }
+
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        String[] tokens = OpenNLPNameFinder.tokenize(text);
+        Map<String, Set<String>> names = new HashMap<>();
+        for (OpenNLPNameFinder finder : nameFinders) {
+            names.putAll(finder.findNames(tokens));
+        }
+        return names;
+    }
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An implementation of {@link NERecogniser} that finds names in text using Open NLP Model.
+ * This implementation works with only one entity type. To chain instances of this name finder,
+ * see {@link OpenNLPNERecogniser}
+ */
+public class OpenNLPNameFinder implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OpenNLPNameFinder.class);
+    private final String nameType;
+    private final Set<String> nameTypes;
+    private NameFinderME nameFinder;
+    private boolean available;
+
+    /**
+     * Creates OpenNLP name finder
+     * @param nameType the entity type recognised by the given NER model
+     * @param nerModelPath path to ner model
+     */
+    public OpenNLPNameFinder(String nameType, String nerModelPath) {
+        this.nameTypes = Collections.singleton(nameType);
+        this.nameType = nameType;
+        InputStream nerModelStream  = getClass().getClassLoader().getResourceAsStream(nerModelPath);
+        try {
+            if (nerModelStream != null){
+                TokenNameFinderModel model = new TokenNameFinderModel(nerModelStream);
+                this.nameFinder = new NameFinderME(model);
+                this.available = true;
+            } else {
+                LOG.warn("Couldn't find model from {} using class loader", nerModelPath);
+            }
+        } catch (IOException e) {
+            LOG.error(e.getMessage(), e);
+        } finally {
+            IOUtils.closeQuietly(nerModelStream);
+        }
+        LOG.info("{} NER : Available for service ? {}", nameType, available);
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return nameTypes;
+    }
+
+    public static String[] tokenize(String text){
+        //NOTE: replace this with a NLP tokenizer tool
+        //clean + split
+        return text.trim().replaceAll("(\\s\\s+)", " ").split("\\s");
+    }
+
+    @Override
+    public synchronized Map<String, Set<String>> recognise(String text) {
+        String[] tokens = tokenize(text);
+        return findNames(tokens);
+    }
+
+    /**
+     * finds names from given array of tokens
+     * @param tokens the tokens array
+     * @return map of EntityType -> set of entity names
+     */
+    public Map<String, Set<String>> findNames(String[] tokens) {
+        Span[] nameSpans = nameFinder.find(tokens);
+        String[] names = Span.spansToStrings(nameSpans, tokens);
+        Map<String, Set<String>> result = new HashMap<>();
+        if (names != null && names.length > 0) {
+            result.put(nameType, new HashSet<>(Arrays.asList(names)));
+        }
+        nameFinder.clearAdaptiveData();
+        return result;
+    }
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * Regular Expressions.
+ *<p>
+ * The default configuration file {@value NER_REGEX_FILE} is used when no
+ * argument constructor is used to instantiate this class. The regex file is
+ * loaded via {@link Class#getResourceAsStream(String)}, so the file should be
+ * placed in the same package path as of this class.
+ * </p>
+ * The format of regex configuration as follows:
+ * <pre>
+ * ENTITY_TYPE1=REGEX1
+ * ENTITY_TYPE2=REGEX2
+ * </pre>
+ *
+ * <i>For example, to extract week day from text:</i>
+ * <pre>WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
+ * </pre>
+ * @since Nov. 7, 2015
+ */
+public class RegexNERecogniser implements NERecogniser {
+
+    public static final String NER_REGEX_FILE = "ner-regex.txt";
+    private static Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);
+
+    public Set<String> entityTypes = new HashSet<>();
+    public Map<String, Pattern> patterns;
+    private boolean available = false;
+
+    private static RegexNERecogniser INSTANCE;
+
+    public RegexNERecogniser(){
+        this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
+    }
+
+    public RegexNERecogniser(InputStream stream){
+        try {
+            patterns = new HashMap<>();
+            List<String> lines = IOUtils.readLines(stream, StandardCharsets.UTF_8);
+            IOUtils.closeQuietly(stream);
+            for (String line : lines) {
+                line = line.trim();
+                if (line.isEmpty() || line.startsWith("#")){ //empty or comment
+                    continue;                                //skip
+                }
+
+                int delim = line.indexOf('=');
+                if (delim < 0) { //delim not found
+                    //skip
+                    LOG.error("Skipped : Invalid config : {} ", line);
+                    continue;
+                }
+                String type = line.substring(0, delim).trim();
+                String patternStr = line.substring(delim+1, line.length()).trim();
+                patterns.put(type, Pattern.compile(patternStr));
+                entityTypes.add(type);
+            }
+        } catch (Exception e) {
+            LOG.error(e.getMessage(), e);
+        }
+        available = !entityTypes.isEmpty();
+    }
+
+    public synchronized static RegexNERecogniser getInstance() {
+        if (INSTANCE == null) {
+            INSTANCE = new RegexNERecogniser();
+        }
+        return INSTANCE;
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return entityTypes;
+    }
+
+    /**
+     * finds matching sub groups in text
+     * @param text text containing interesting sub strings
+     * @param pattern pattern to find sub strings
+     * @return set of sub strings if any found, or null if none found
+     */
+    public Set<String> findMatches(String text, Pattern pattern){
+        Set<String> results = null;
+        Matcher matcher = pattern.matcher(text);
+        if (matcher.find()) {
+            results = new HashSet<>();
+            results.add(matcher.group(0));
+            while (matcher.find()) {
+                results.add(matcher.group(0));
+            }
+        }
+        return results;
+    }
+
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> result = new HashMap<>();
+        for (Map.Entry<String, Pattern> entry : patterns.entrySet()) {
+            Set<String> names = findMatches(text, entry.getValue());
+            if (names != null) {
+                result.put(entry.getKey(), names);
+            }
+        }
+        return result;
+    }
+}

Added: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt (added)
+++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt Tue Nov 17 17:29:25 2015
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ 
+
+# Each pattern is defined on its own line as follows:
+# type = regex
+# The first occurrence of '=' separates the type from its regex.
+
+# WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link NamedEntityParser}
+ */
+public class NamedEntityParserTest {
+
+    public static final String CONFIG_FILE = "tika-config.xml";
+
+    /**
+     * Parses a short text with the configured recogniser and checks that
+     * person, location, organization and date entities land in the metadata.
+     */
+    @Test
+    public void testParse() throws Exception {
+
+        //test config is added to resources directory
+        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+        Tika tika = new Tika(config);
+        String text = "I am student at University of Southern California (USC)," +
+                " located in Los Angeles . USC's football team is called by name Trojans." +
+                " Mr. John McKay was a head coach of the team from 1960 - 1975";
+        Metadata md = new Metadata();
+        //fixed charset keeps the test independent of the platform default
+        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.forName("UTF-8"))), md);
+
+        HashSet<String> set = new HashSet<String>();
+        set.addAll(Arrays.asList(md.getValues("X-Parsed-By")));
+        assertTrue(set.contains(NamedEntityParser.class.getName()));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_PERSON")));
+        assertTrue(set.contains("John McKay"));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_LOCATION")));
+        assertTrue(set.contains("Los Angeles"));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_ORGANIZATION")));
+        assertTrue(set.contains("University of Southern California"));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_DATE")));
+        assertTrue(set.contains("1960 - 1975"));
+
+    }
+
+    /**
+     * Chains the OpenNLP and regex recognisers through the system property
+     * and checks that entities from both show up in the metadata.
+     */
+    @Test
+    public void testNerChain() throws Exception {
+        String classNames = OpenNLPNERecogniser.class.getName()
+                + "," + RegexNERecogniser.class.getName();
+        //system properties are JVM wide: remember the old value so that
+        //other tests are not affected by this one
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
+        try {
+            TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+            Tika tika = new Tika(config);
+            String text = "University of Southern California (USC), is located in Los Angeles ." +
+                    " Campus is busy from monday to saturday";
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(Charset.forName("UTF-8"))), md);
+            HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
+            assertTrue(keys.contains("NER_WEEK_DAY"));
+            assertTrue(keys.contains("NER_LOCATION"));
+        } finally {
+            restoreProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+        }
+
+    }
+
+    /**
+     * Puts a system property back to the value it had before the test.
+     */
+    private static void restoreProperty(String key, String previous) {
+        if (previous == null) {
+            System.clearProperty(key);
+        } else {
+            System.setProperty(key, previous);
+        }
+    }
+}
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java Tue Nov 17 17:29:25 2015
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link RegexNERecogniser}, run through {@link NamedEntityParser}.
+ */
+public class RegexNERecogniserTest {
+
+    /**
+     * Selects the regex recogniser through the system property and checks
+     * that exactly the week days of the text are reported.
+     */
+    @Test
+    public void testGetEntityTypes() throws Exception {
+
+        String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
+        //system properties are JVM wide: remember the old value so that
+        //other tests are not affected by this one
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
+        try {
+            Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+            Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
+            assertTrue(days.contains("Sunday"));
+            assertTrue(days.contains("MONDAY"));
+            assertTrue(days.contains("Saturday"));
+            assertTrue("Unexpected week days: " + days, days.size() == 3); //and nothing else
+        } finally {
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+    }
+}
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy Tue Nov 17 17:29:25 2015
@@ -0,0 +1,93 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file downloads Apache OpenNLP NER models for testing the NamedEntityParser
+ */
+
+import org.apache.commons.io.IOUtils
+
+/**
+ * Copies input stream to output stream, additionally printing the progress
+ * about once per second.
+ * Both streams are closed when the copy ends, whether it succeeds or fails.
+ * @param inStr source stream
+ * @param outStr target stream
+ * @param totalLength the total length of the content (used to calculate progress);
+ *        when it is not positive only the byte count is printed
+ * @return nothing of interest
+ */
+def copyWithProgress(InputStream inStr, OutputStream outStr, long totalLength){
+    int PROGRESS_DELAY = 1000;
+    byte[] buffer = new byte[1024 * 4]
+    long count = 0
+    int len
+    long tt = System.currentTimeMillis()
+    try {
+        // read() returns -1 at end of stream
+        while ((len = inStr.read(buffer)) != -1) {
+            outStr.write(buffer, 0, len)
+            count += len
+            if (System.currentTimeMillis() - tt > PROGRESS_DELAY) {
+                if (totalLength > 0) {
+                    println "${count * 100.0/totalLength}% : $count bytes of $totalLength"
+                } else {
+                    // length unknown, a percentage would be meaningless
+                    println "$count bytes copied"
+                }
+                tt = System.currentTimeMillis()
+            }
+        }
+        outStr.flush()
+        println "Copy complete. "
+    } finally {
+        IOUtils.closeQuietly(inStr)
+        IOUtils.closeQuietly(outStr)
+    }
+}
+
+/**
+ * Downloads file.
+ * The content is first written to a temporary ".part" file next to the target
+ * and moved into place only after the transfer has completed, so a failed
+ * download never leaves a truncated file at the target path.
+ * @param urlStr url of file
+ * @param file path to store file
+ * @return nothing of interest
+ * @throws IOException if the transfer fails or the file cannot be moved into place
+ */
+def downloadFile(String urlStr, File file) {
+    println "GET : $urlStr -> $file"
+    def urlConn = new URL(urlStr).openConnection()
+    long contentLength = urlConn.getContentLengthLong()
+
+    // absolute path: a bare file name has no parent otherwise
+    file.getAbsoluteFile().getParentFile().mkdirs()
+    File partFile = new File(file.getPath() + ".part")
+    def inStream = null
+    def outStream = null
+    boolean complete = false
+    try {
+        inStream = urlConn.getInputStream()
+        outStream = new FileOutputStream(partFile)
+        copyWithProgress(inStream, outStream, contentLength)
+        complete = true
+    } finally {
+        IOUtils.closeQuietly(outStream)
+        IOUtils.closeQuietly(inStream)
+        if (!complete) {
+            partFile.delete()
+        }
+    }
+    if (file.exists() && !file.delete()) {
+        throw new IOException("Could not replace existing file $file")
+    }
+    if (!partFile.renameTo(file)) {
+        throw new IOException("Could not move $partFile to $file")
+    }
+    println "Download Complete.."
+}
+
+
+// where the models come from and where they are stored
+def urlPrefix = "http://opennlp.sourceforge.net/models-1.5"
+def prefixPath = "src/test/resources/org/apache/tika/parser/ner/opennlp/"
+
+// detecting proper path for test resources
+boolean atParentProject = new File("tika-parsers").exists() && new File("tika-app").exists()
+if (atParentProject) {
+    // running from parent maven project, but resources should go to sub-module
+    prefixPath = "tika-parsers/" + prefixPath
+}
+
+def modelFiles = //filePath : url
+        [ (prefixPath + "ner-person.bin"): (urlPrefix + "/en-ner-person.bin"),
+          (prefixPath + "ner-location.bin"): (urlPrefix + "/en-ner-location.bin"),
+          (prefixPath + "ner-organization.bin"): (urlPrefix + "/en-ner-organization.bin"),
+          (prefixPath + "ner-date.bin"): (urlPrefix + "/en-ner-date.bin")]
+
+// fetch only the models which are not present yet
+modelFiles.each { path, url ->
+    File target = new File(path)
+    if (!target.exists()) {
+        downloadFile(url, target)
+    }
+}
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh Tue Nov 17 17:29:25 2015
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Getting OpenNLP NER models"
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-person.bin" -O ner-person.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-location.bin" -O ner-location.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-organization.bin" -O ner-organization.bin
+
+# Additional 4
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-date.bin" -O ner-date.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-money.bin" -O ner-money.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-time.bin" -O ner-time.bin
+wget "http://opennlp.sourceforge.net/models-1.5/en-ner-percentage.bin" -O ner-percentage.bin
\ No newline at end of file

Propchange: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/get-models.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt Tue Nov 17 17:29:25 2015
@@ -0,0 +1,17 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file

Added: tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml?rev=1714835&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml (added)
+++ tika/trunk/tika-parsers/src/test/resources/org/apache/tika/parser/ner/tika-config.xml Tue Nov 17 17:29:25 2015
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <!-- Test configuration: NamedEntityParser is the only parser declared here,
+             registered for plain text, HTML and XHTML content -->
+        <parser class="org.apache.tika.parser.ner.NamedEntityParser">
+            <mime>text/plain</mime>
+            <mime>text/html</mime>
+            <mime>application/xhtml+xml</mime>
+        </parser>
+    </parsers>
+
+</properties>
\ No newline at end of file