You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/10 00:18:34 UTC

[tika] branch branch_1x updated: TIKA-3191: Updated GrobidRESTParser to use new API location, whilst retaining backwards compatibility

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 1757667  TIKA-3191: Updated GrobidRESTParser to use new API location, whilst retaining backwards compatibility
1757667 is described below

commit 175766713ec404418f349206dc43ffb9730994e2
Author: David Meikle <dm...@apache.org>
AuthorDate: Mon Nov 9 23:54:57 2020 +0000

    TIKA-3191: Updated GrobidRESTParser to use new API location, whilst retaining backwards compatibility
---
 .../tika/parser/journal/GrobidRESTParser.java      | 51 +++++++++++++++-------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
index 110c504..4eb1e31 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
@@ -29,6 +29,7 @@ import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
 import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.slf4j.Logger;
@@ -39,14 +40,15 @@ public class GrobidRESTParser {
 
     private static final Logger LOG = LoggerFactory.getLogger(GrobidRESTParser.class);
 
-
     private static final String GROBID_REST_HOST = "http://localhost:8080";
 
-    private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive
-    // doesn't work
-    // nfc why
+    private Boolean legacyMode = null;
+
+    private static final String GROBID_ISALIVE_PATH = "/api/isalive";
+    private static final String GROBID_PROCESSHEADER_PATH = "/api/processHeaderDocument";
 
-    private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+    private static final String GROBID_LEGACY_ISALIVE_PATH = "/grobid";
+    private static final String GROBID_LEGACY_PROCESSHEADER_PATH = "/processHeaderDocument";
 
     private String restHostUrlStr;
 
@@ -58,8 +60,7 @@ public class GrobidRESTParser {
             LOG.warn("can't read rest url", e);
         }
 
-        if (restHostUrlStr == null
-                || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+        if (restHostUrlStr == null || restHostUrlStr.equals("")) {
             this.restHostUrlStr = GROBID_REST_HOST;
         } else {
             this.restHostUrlStr = restHostUrlStr;
@@ -75,12 +76,15 @@ public class GrobidRESTParser {
         Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd);
         MultipartBody body = new MultipartBody(att);
 
-        Response response = WebClient
-                .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
-                .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
-                .post(body);
-
         try {
+            checkMode();
+            Response response = WebClient
+                    .create(restHostUrlStr
+                            + (legacyMode ? GROBID_LEGACY_PROCESSHEADER_PATH : GROBID_PROCESSHEADER_PATH))
+                    .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+                    .post(body);
+
+
             String resp = response.readEntity(String.class);
             Metadata teiMet = new TEIDOMParser().parse(resp, context);
             for (String key : teiMet.names()) {
@@ -99,14 +103,29 @@ public class GrobidRESTParser {
         return grobidProperties.getProperty("grobid.server.url");
     }
 
+    private void checkMode() throws TikaException {
+        if (legacyMode != null) {
+            return;
+        }
+        Response response = WebClient.create(restHostUrlStr + GROBID_ISALIVE_PATH).head();
+        if (response.getStatus() == 200) {
+            legacyMode = false;
+            return;
+        }
+        response = WebClient.create(restHostUrlStr + GROBID_LEGACY_ISALIVE_PATH).head();
+        if (response.getStatus() == 200) {
+            legacyMode = true;
+            return;
+        }
+        throw new TikaException("Cannot connect to Grobid Service");
+    }
+
     protected static boolean canRun() {
         Response response = null;
-
         try {
-            response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH)
-                    .accept(MediaType.TEXT_HTML).get();
+            response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH).get();
             String resp = response.readEntity(String.class);
-            return resp != null && !resp.equals("") && resp.startsWith("<h4>");
+            return resp != null && !resp.equals("") && resp.startsWith("true");
         } catch (Exception e) {
             //swallow...can't run
             return false;