You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/03/02 06:42:22 UTC

[13/20] tika git commit: fix for TIKA-1876 contributed by manalishah

fix for TIKA-1876 contributed by manalishah


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c809690e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c809690e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c809690e

Branch: refs/heads/master
Commit: c809690ec87ffa600018dbc5eee6d6756645adb0
Parents: ed762b7
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 19:58:06 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 19:58:06 2016 -0800

----------------------------------------------------------------------
 .gitignore                                      |   2 +-
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  | 137 +++++++++++++++++++
 .../tika/parser/ner/nltk/NLTKServer.properties  |  16 +++
 .../parser/ner/nltk/NLTKNERecogniserTest.java   |  54 ++++++++
 4 files changed, 208 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c262c68..8093709 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,4 @@ target
 *.iws
 *.bin
 nbactions.xml
-nb-configuration.xml
\ No newline at end of file
+nb-configuration.xml*.DS_Store

http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..850f4dd
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.util.*;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+
+/**
+ *  This class offers an implementation of {@link NERecogniser} based on
+ *  the ne_chunk() module of NLTK. This NER requires additional setup,
+ *  because it sends HTTP requests to an endpoint server that runs NLTK.
+ *  This endpoint has been implemented as a pip/setuptools installable Python module.
+ *  See <a href="https://github.com/manalishah/NLTKRest">NLTKRest</a>
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNLTK">TikaAndNLTK</a>
+ *
+ */
+public class NLTKNERecogniser implements NERecogniser {
+    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+    /** Endpoint used when NLTKServer.properties supplies no {@code nltk.server.url}. */
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
+    /** Entity types reported by NLTK. NOTE(review): shared, and rewritten by every recognise() call. */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(Collections.singleton("NAMES"));
+    /** Base URL of the NLTK REST server; the constructor always sets it to a non-empty value. */
+    String restHostUrlStr;
+    /** Outcome of this instance's own start-up probe (per instance, not shared). */
+    private boolean available;
+
+    /** Resolves the server URL, falling back to the default, and probes that resolved URL once. */
+    public NLTKNERecogniser() {
+        String configured = null;
+        try {
+            configured = readRestUrl();
+        } catch (IOException e) {
+            LOG.warn("Could not read NLTKServer.properties, using " + NLTK_REST_HOST, e);
+        }
+        restHostUrlStr = (configured == null || configured.isEmpty()) ? NLTK_REST_HOST : configured;
+        Response response = null;
+        try {
+            response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+            if (response.getStatus() == 200) {
+                available = true;
+            } else {
+                LOG.info("NLTKRest Server is not running");
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        } finally {
+            if (response != null) {
+                response.close();
+            }
+        }
+    }
+
+    /**
+     * Reads {@code nltk.server.url} from NLTKServer.properties; {@code null} if resource or key is missing.
+     * @throws IOException if the properties resource cannot be read
+     */
+    private static String readRestUrl() throws IOException {
+        try (InputStream in = NLTKNERecogniser.class.getResourceAsStream("NLTKServer.properties")) {
+            Properties nltkProperties = new Properties();
+            if (in != null) { // Properties.load(null) would throw a NullPointerException
+                nltkProperties.load(in);
+            }
+            return nltkProperties.getProperty("nltk.server.url");
+        }
+    }
+
+    /** @return {@code true} only if the start-up probe of the server endpoint returned HTTP 200 */
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /** @return the shared set of entity types recognised by this recogniser */
+    @Override
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * Recognises names of entities in the text by POSTing it to the {@code /nltk} endpoint.
+     * @param text text which possibly contains names
+     * @return map of entity type to set of names; empty if the request or the parsing fails
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<>();
+        Response response = null;
+        try {
+            response = WebClient.create(restHostUrlStr + "/nltk").accept(MediaType.TEXT_HTML).post(text);
+            if (response.getStatus() == 200) {
+                JSONObject json = (JSONObject) new JSONParser().parse(response.readEntity(String.class));
+                Object names = json.get("names");
+                if (names instanceof Collection) {
+                    Set<String> found = new HashSet<>();
+                    for (Object name : (Collection<?>) names) {
+                        found.add(String.valueOf(name));
+                    }
+                    entities.put("NAMES", found);
+                }
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        } finally {
+            if (response != null) {
+                response.close();
+            }
+        }
+        ENTITY_TYPES.clear();
+        ENTITY_TYPES.addAll(entities.keySet());
+        return entities;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..5909b69
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+nltk.server.url=http://localhost:8881

http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..5c1307f
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.commons.logging.Log;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class NLTKNERecogniserTest {
+    /**
+     * Parses one sentence with {@link NLTKNERecogniser} selected as the NER implementation. An empty
+     * NER_NAMES result is accepted as-is; otherwise "America" must be the only name reported.
+     */
+    @Test
+    public void testGetEntityTypes() throws Exception {
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
+        TikaConfig config = new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml"));
+        Metadata md = new Metadata();
+        byte[] sentence = "America is a big country.".getBytes(StandardCharsets.UTF_8);
+        new Tika(config).parse(new ByteArrayInputStream(sentence), md);
+        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+        if (!names.isEmpty()) {
+            assertTrue(names.contains("America"));
+            assertTrue(names.size() == 1); //and nothing else
+        }
+    }
+}