You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:23:53 UTC
[tika] branch main updated: Update Grobid parsers (#1280)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6d8dbcdd3 Update Grobid parsers (#1280)
6d8dbcdd3 is described below
commit 6d8dbcdd35f448623a0080ec0ecdc62f93dc1359
Author: Luca Foppiano <lf...@users.noreply.github.com>
AuthorDate: Wed Aug 16 00:23:48 2023 +0900
Update Grobid parsers (#1280)
* Update default values
* Improve check mechanism, correct config file
---
.../tika/parser/journal/GrobidRESTParser.java | 7 ++---
.../tika/parser/ner/grobid/GrobidNERecogniser.java | 31 +++++++++++++++-------
.../tika/parser/journal/GrobidExtractor.properties | 2 +-
.../tika/parser/ner/grobid/GrobidServer.properties | 4 +--
4 files changed, 29 insertions(+), 15 deletions(-)
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
index cc2841880..d7aedd660 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
@@ -41,7 +41,7 @@ public class GrobidRESTParser {
private static final Logger LOG = LoggerFactory.getLogger(GrobidRESTParser.class);
- private static final String GROBID_REST_HOST = "http://localhost:8080";
+ private static final String GROBID_REST_HOST = "http://localhost:8070";
private static final String GROBID_ISALIVE_PATH = "/api/isalive";
private static final String GROBID_PROCESSHEADER_PATH = "/api/processHeaderDocument";
private static final String GROBID_LEGACY_ISALIVE_PATH = "/grobid";
@@ -96,8 +96,9 @@ public class GrobidRESTParser {
try {
checkMode();
Response response = WebClient.create(restHostUrlStr +
- (legacyMode ? GROBID_LEGACY_PROCESSHEADER_PATH : GROBID_PROCESSHEADER_PATH))
- .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+ (legacyMode ? GROBID_LEGACY_PROCESSHEADER_PATH : GROBID_PROCESSHEADER_PATH))
+ .accept(MediaType.APPLICATION_XML)
+ .type(MediaType.MULTIPART_FORM_DATA)
.post(body);
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
index 1f173e381..1812b4b82 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
@@ -47,10 +47,11 @@ public class GrobidNERecogniser implements NERecogniser {
add("NORMALIZED_MEASUREMENTS");
add("MEASUREMENT_TYPES");
}
- };
+ };
private static final Logger LOG = LoggerFactory.getLogger(GrobidNERecogniser.class);
- private static final String GROBID_REST_HOST = "http://localhost:8080";
- private static boolean available = false;
+ private static final String GROBID_REST_HOST = "http://localhost:8060";
+ private static final String ISALIVE_URL = "/service/isalive";
+ private boolean available = false;
private String restHostUrlStr;
@@ -62,7 +63,6 @@ public class GrobidNERecogniser implements NERecogniser {
restHostUrlStr = readRestUrl();
} catch (IOException e) {
LOG.warn("couldn't read rest url", e);
-
}
if (restHostUrlStr == null || restHostUrlStr.equals("")) {
@@ -71,18 +71,30 @@ public class GrobidNERecogniser implements NERecogniser {
this.restHostUrlStr = restHostUrlStr;
}
+ this.available = isServerAlive(restHostUrlStr);
+
+ } catch (Exception e) {
+ LOG.info(e.getMessage(), e);
+ }
+ }
+
+ private static boolean isServerAlive(String restHostUrlStr) {
+ boolean available = false;
+ try {
Response response =
- WebClient.create(restHostUrlStr).accept(MediaType.APPLICATION_JSON).get();
+ WebClient.create(restHostUrlStr + ISALIVE_URL)
+ .get();
int responseCode = response.getStatus();
if (responseCode == 200) {
available = true;
} else {
- LOG.info("Grobid REST Server is not running");
+ LOG.info("Grobid Quantities REST Server is not running");
}
-
} catch (Exception e) {
- LOG.info(e.getMessage(), e);
+ LOG.info("Grobid Quantities REST Server is not running", e);
}
+
+ return available;
}
/**
@@ -173,7 +185,8 @@ public class GrobidNERecogniser implements NERecogniser {
try {
String url = restHostUrlStr + readRestEndpoint();
Response response =
- WebClient.create(url).accept(MediaType.APPLICATION_JSON).post("text=" + text);
+ WebClient.create(url).accept(MediaType.APPLICATION_JSON)
+ .post("text=" + text);
int responseCode = response.getStatus();
if (responseCode == 200) {
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
index 488f0c593..002b0430c 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/journal/GrobidExtractor.properties
@@ -13,4 +13,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-grobid.server.url=http://localhost:8080
+grobid.server.url=http://localhost:8070
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
index 3fc609af4..0803e38fd 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
@@ -13,5 +13,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-grobid.server.url=http://localhost:8080
-grobid.endpoint.text=/processQuantityText
+grobid.server.url=http://localhost:8060
+grobid.endpoint.text=/service/processQuantityText