You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2016/03/02 06:42:22 UTC
[13/20] tika git commit: fix for TIKA-1876 contributed by manalishah
fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c809690e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c809690e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c809690e
Branch: refs/heads/master
Commit: c809690ec87ffa600018dbc5eee6d6756645adb0
Parents: ed762b7
Author: manali <ma...@gmail.com>
Authored: Fri Feb 26 19:58:06 2016 -0800
Committer: manali <ma...@gmail.com>
Committed: Fri Feb 26 19:58:06 2016 -0800
----------------------------------------------------------------------
.gitignore | 2 +-
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 137 +++++++++++++++++++
.../tika/parser/ner/nltk/NLTKServer.properties | 16 +++
.../parser/ner/nltk/NLTKNERecogniserTest.java | 54 ++++++++
4 files changed, 208 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c262c68..8093709 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ target
*.iws
*.bin
nbactions.xml
-nb-configuration.xml
\ No newline at end of file
+nb-configuration.xml
+*.DS_Store
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
new file mode 100644
index 0000000..850f4dd
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.util.*;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * ne_chunk() module of NLTK. This NER requires additional setup,
+ * due to Http requests to an endpoint server that runs NLTK.
+ * This endpoint has been implemented as pip/setuptools installable python module.
+ * See <a href="https://github.com/manalishah/NLTKRest">NLTKRest</a> and
+ * <a href="http://wiki.apache.org/tika/TikaAndNLTK">TikaAndNLTK</a>.
+ */
+public class NLTKNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
+
+    /** Shared by all instances; set to {@code true} once any instance gets a 200 from the server. */
+    private static boolean available = false;
+
+    /** Server URL used when {@code NLTKServer.properties} does not provide one. */
+    private static final String NLTK_REST_HOST = "http://localhost:8881";
+
+    /** Key under which recognised names are reported. */
+    private static final String NAMES = "NAMES";
+
+    /**
+     * Entity types identified by NLTK. The set is read-only, so callers always
+     * see the same types regardless of the outcome of the last request.
+     */
+    public static final Set<String> ENTITY_TYPES = Collections.singleton(NAMES);
+
+    /** Base URL of the NLTK REST server; always assigned by the constructor. */
+    String restHostUrlStr;
+
+    /**
+     * Resolves the server URL from {@code NLTKServer.properties}, falling back to
+     * the built-in default, and probes that URL once to record availability.
+     */
+    public NLTKNERecogniser() {
+        String configuredUrl = null;
+        try {
+            configuredUrl = readRestUrl();
+        } catch (IOException | IllegalArgumentException e) {
+            LOG.warn("Could not read NLTKServer.properties, using " + NLTK_REST_HOST, e);
+        }
+
+        // Assign the field before probing so it is set even if the probe throws.
+        if (configuredUrl == null || configuredUrl.trim().isEmpty()) {
+            this.restHostUrlStr = NLTK_REST_HOST;
+        } else {
+            this.restHostUrlStr = configuredUrl.trim();
+        }
+
+        Response response = null;
+        try {
+            //check if nltkrest is running; probe the resolved URL, not the raw property
+            response = WebClient.create(this.restHostUrlStr).accept(MediaType.TEXT_HTML).get();
+            if (response.getStatus() == 200) {
+                available = true;
+            } else {
+                LOG.info("NLTKRest Server is not running");
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        } finally {
+            closeQuietly(response);
+        }
+    }
+
+    /**
+     * Reads {@code nltk.server.url} from the bundled properties resource.
+     *
+     * @return the configured URL, or {@code null} if the resource or the key is absent
+     * @throws IOException if the resource exists but cannot be read
+     */
+    private static String readRestUrl() throws IOException {
+        try (InputStream in = NLTKNERecogniser.class.getResourceAsStream("NLTKServer.properties")) {
+            if (in == null) {
+                return null;
+            }
+            Properties nltkProperties = new Properties();
+            nltkProperties.load(in);
+            return nltkProperties.getProperty("nltk.server.url");
+        }
+    }
+
+    /**
+     * Closes {@code response} if it is not {@code null}; a failure to close is
+     * logged at debug level and otherwise ignored.
+     */
+    private static void closeQuietly(Response response) {
+        if (response == null) {
+            return;
+        }
+        try {
+            response.close();
+        } catch (RuntimeException e) {
+            LOG.debug(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * @return {@code true} if server endpoint is available.
+     * returns {@code false} if server endpoint is not available for service.
+     */
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    @Override
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names; empty when the server does not
+     *         answer with status 200 or its reply cannot be parsed
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<>();
+        Response response = null;
+        try {
+            String url = restHostUrlStr + "/nltk";
+            response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
+            if (response.getStatus() == 200) {
+                String result = response.readEntity(String.class);
+                JSONObject json = (JSONObject) new JSONParser().parse(result);
+                Object names = json.get("names");
+                // A reply without a "names" collection yields no entities.
+                if (names instanceof Collection) {
+                    Set<String> found = new HashSet<>();
+                    for (Object name : (Collection<?>) names) {
+                        if (name != null) {
+                            found.add(name.toString());
+                        }
+                    }
+                    entities.put(NAMES, found);
+                }
+            }
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        } finally {
+            closeQuietly(response);
+        }
+        return entities;
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
new file mode 100644
index 0000000..5909b69
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+nltk.server.url=http://localhost:8881
http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
new file mode 100644
index 0000000..5c1307f
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.nltk;
+
+import org.apache.commons.logging.Log;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class NLTKNERecogniserTest {
+    /**
+     * Parses a short sentence with {@link NamedEntityParser} configured to use
+     * {@link NLTKNERecogniser} and checks the names stored under {@code NER_NAMES}.
+     * When no names were extracted the test is reported as skipped rather than
+     * passed (presumably because no NLTK REST server is running).
+     */
+    @Test
+    public void testGetEntityTypes() throws Exception {
+        String text = "America is a big country.";
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
+        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+        Metadata md = new Metadata();
+        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
+        // Fully qualified so the file's import block does not need to change.
+        org.junit.Assume.assumeTrue(!names.isEmpty());
+        assertTrue(names.contains("America"));
+        assertTrue(names.size() == 1); //and nothing else
+    }
+}