You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2022/08/09 08:40:38 UTC
[nutch] branch master updated: NUTCH-2956 index-geoip: dependency upgrades and improvements - upgrade to geoip2 3.0.1 - exclude transitive dependencies (Jackson) provided as Nutch core deps - read also GeoLite2-*.mmdb files - review index field names in plugin and Nutch Solr schema: - fix typos in field names - remove unused fields from schema
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 8fc4f17ac NUTCH-2956 index-geoip: dependency upgrades and improvements - upgrade to geoip2 3.0.1 - exclude transitive dependencies (Jackson) provided as Nutch core deps - read also GeoLite2-*.mmdb files - review index field names in plugin and Nutch Solr schema: - fix typos in field names - remove unused fields from schema
8fc4f17ac is described below
commit 8fc4f17acc5da28c22ef4e77c2316e20e5976f02
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Sat Aug 6 15:04:10 2022 +0200
NUTCH-2956 index-geoip: dependency upgrades and improvements
- upgrade to geoip2 3.0.1
- exclude transitive dependencies (Jackson) provided as Nutch core deps
- read also GeoLite2-*.mmdb files
- review index field names in plugin and Nutch Solr schema:
- fix typos in field names
- remove unused fields from schema
---
conf/nutch-default.xml | 3 +-
src/plugin/index-geoip/ivy.xml | 11 +++--
src/plugin/index-geoip/plugin.xml | 7 +---
.../nutch/indexer/geoip/GeoIPDocumentCreator.java | 49 ++++++++++++----------
.../nutch/indexer/geoip/GeoIPIndexingFilter.java | 34 ++++++++-------
src/plugin/indexer-solr/schema.xml | 3 +-
6 files changed, 57 insertions(+), 50 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 7faa6fdcd..bb9aae1b3 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2112,7 +2112,8 @@ Add scoring-metadata to the list of active plugins
'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and
- available at runtime.
+ available at runtime. Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb)
+ can also be used.
</description>
</property>
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
index 4fa6f71a7..2eda5a63f 100644
--- a/src/plugin/index-geoip/ivy.xml
+++ b/src/plugin/index-geoip/ivy.xml
@@ -36,12 +36,11 @@
</publications>
<dependencies>
- <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.12.0" >
- <!-- Exlude due to classpath issues -->
- <exclude org="org.apache.httpcomponents" name="httpclient" />
- <exclude org="org.apache.httpcomponents" name="httpcore" />
- <exclude org="commons-codec" name="commons-codec" />
- <exclude org="commons-logging" name="commons-logging" />
+ <dependency org="com.maxmind.geoip2" name="geoip2" rev="3.0.1">
+ <!-- Exclude libs provided in Nutch core -->
+ <exclude org="com.fasterxml.jackson.core" name="jackson-annotations" />
+ <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
+ <exclude org="com.fasterxml.jackson.core" name="jackson-core" />
</dependency>
</dependencies>
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
index 6148f59e5..c4efadf94 100644
--- a/src/plugin/index-geoip/plugin.xml
+++ b/src/plugin/index-geoip/plugin.xml
@@ -25,11 +25,8 @@
<library name="index-geoip.jar">
<export name="*"/>
</library>
- <library name="geoip2-2.12.0.jar"/>
- <library name="jackson-annotations-2.9.5.jar"/>
- <library name="jackson-core-2.9.5.jar"/>
- <library name="jackson-databind-2.9.5.jar"/>
- <library name="maxmind-db-1.2.2.jar"/>
+ <library name="geoip2-3.0.1.jar"/>
+ <library name="maxmind-db-2.0.0.jar"/>
</runtime>
<requires>
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
index 1c697a205..64b3862be 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -17,13 +17,17 @@
package org.apache.nutch.indexer.geoip;
import java.io.IOException;
+import java.lang.invoke.MethodHandles;
import java.net.InetAddress;
import java.net.UnknownHostException;
import org.apache.nutch.indexer.NutchDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.maxmind.geoip2.DatabaseReader;
import com.maxmind.geoip2.WebServiceClient;
+import com.maxmind.geoip2.exception.AddressNotFoundException;
import com.maxmind.geoip2.exception.GeoIp2Exception;
import com.maxmind.geoip2.model.InsightsResponse;
import com.maxmind.geoip2.model.CityResponse;
@@ -54,28 +58,17 @@ import com.maxmind.geoip2.record.Traits;
*/
public class GeoIPDocumentCreator {
- /**
- * Add field to document but only if value isn't null
- * @param doc the {@link NutchDocument} to augment
- * @param name the name of the target field
- * @param value the String value to associate with the target field
- */
- public static void addIfNotNull(NutchDocument doc, String name,
- String value) {
- if (value != null) {
- doc.add(name, value);
- }
- }
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
/**
* Add field to document but only if value isn't null
* @param doc the {@link NutchDocument} to augment
* @param name the name of the target field
- * @param value the {@link java.lang.Integer} value to
- * associate with the target field
+ * @param value the value to associate with the target field; nothing is added if it is null
*/
public static void addIfNotNull(NutchDocument doc, String name,
- Integer value) {
+ Object value) {
if (value != null) {
doc.add(name, value);
}
@@ -87,7 +80,6 @@ public class GeoIPDocumentCreator {
addIfNotNull(doc, "ip", serverIp);
InsightsResponse response = client
.insights(InetAddress.getByName(serverIp));
- // CityResponse response = client.city(InetAddress.getByName(serverIp));
City city = response.getCity();
addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
@@ -103,7 +95,7 @@ public class GeoIPDocumentCreator {
addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
addIfNotNull(doc, "countryName", country.getName()); // 'United States'
addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
- addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
+ addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());
Location location = response.getLocation();
addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
@@ -121,7 +113,7 @@ public class GeoIPDocumentCreator {
Subdivision subdivision = response.getMostSpecificSubdivision();
addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
- addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
@@ -169,7 +161,13 @@ public class GeoIPDocumentCreator {
public static NutchDocument createDocFromDomainDb(String serverIp,
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
- DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
+ DomainResponse response;
+ try {
+ response = reader.domain(InetAddress.getByName(serverIp));
+ } catch (AddressNotFoundException e) {
+ LOG.debug("IP address not found: {}", serverIp);
+ return doc;
+ }
addIfNotNull(doc, "ip", serverIp);
addIfNotNull(doc, "domain", response.getDomain());
return doc;
@@ -189,7 +187,14 @@ public class GeoIPDocumentCreator {
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
addIfNotNull(doc, "ip", serverIp);
- CityResponse response = reader.city(InetAddress.getByName(serverIp));
+
+ CityResponse response;
+ try {
+ response = reader.city(InetAddress.getByName(serverIp));
+ } catch (AddressNotFoundException e) {
+ LOG.debug("IP address not found: {}", serverIp);
+ return doc;
+ }
City city = response.getCity();
addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
@@ -206,7 +211,7 @@ public class GeoIPDocumentCreator {
addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
addIfNotNull(doc, "countryName", country.getName()); // 'United States'
addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
- addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
+ addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());
Location location = response.getLocation();
addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
@@ -224,7 +229,7 @@ public class GeoIPDocumentCreator {
Subdivision subdivision = response.getMostSpecificSubdivision();
addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
- addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
return doc;
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
index 4e2127365..ea30b8c7b 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -87,7 +87,8 @@ import com.maxmind.geoip2.WebServiceClient;
* 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
* Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
* GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath
- * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
+ * and available at runtime. This can be achieved by adding it to `$NUTCH_HOME/conf`.
+ * Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb) can also be used.
* </description>
* </property>
*
@@ -152,24 +153,29 @@ public class GeoIPIndexingFilter implements IndexingFilter {
conf.getInt("index.geoip.userid", 12345),
conf.get("index.geoip.licensekey")).build();
} else {
- String db = null;
+ String dbSuffix = null;
if (usage.equalsIgnoreCase("cityDatabase")) {
- db = "GeoIP2-City.mmdb";
+ dbSuffix = "-City.mmdb";
} else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
- db = "GeoIP2-Connection-Type.mmdb";
+ dbSuffix = "-Connection-Type.mmdb";
} else if (usage.equalsIgnoreCase("domainDatabase")) {
- db = "GeoIP2-Domain.mmdb";
+ dbSuffix = "-Domain.mmdb";
} else if (usage.equalsIgnoreCase("ispDatabase")) {
- db = "GeoIP2-ISP.mmdb";
+ dbSuffix = "-ISP.mmdb";
}
- URL dbFileUrl = conf.getResource(db);
- if (dbFileUrl == null) {
- LOG.error("GeoDb file {} not found on classpath", db);
- } else {
- try {
- buildDb(new File(dbFileUrl.getFile()));
- } catch (Exception e) {
- LOG.error("Failed to read geoDb file {}: ", db, e);
+ String[] dbPrefixes = {"GeoIP2", "GeoLite2"};
+ for (String dbPrefix : dbPrefixes) {
+ String db = dbPrefix + dbSuffix;
+ URL dbFileUrl = conf.getResource(db);
+ if (dbFileUrl == null) {
+ LOG.error("GeoDb file {} not found on classpath", db);
+ } else {
+ try {
+ LOG.info("Reading GeoDb file {}", db);
+ buildDb(new File(dbFileUrl.getFile()));
+ } catch (Exception e) {
+ LOG.error("Failed to read geoDb file {}: ", db, e);
+ }
}
}
}
diff --git a/src/plugin/indexer-solr/schema.xml b/src/plugin/indexer-solr/schema.xml
index 6865eb02c..ba71fe148 100644
--- a/src/plugin/indexer-solr/schema.xml
+++ b/src/plugin/indexer-solr/schema.xml
@@ -356,7 +356,7 @@
<field name="cityGeoNameId" type="int" stored="true" indexed="true" />
<field name="continentCode" type="string" stored="true" indexed="true" />
<field name="continentGeoNameId" type="int" stored="true" indexed="true" />
- <field name="contentName" type="string" stored="true" indexed="true" />
+ <field name="continentName" type="string" stored="true" indexed="true" />
<field name="countryIsoCode" type="string" stored="true" indexed="true"/>
<field name="countryName" type="string" stored="true" indexed="true" />
<field name="countryConfidence" type="int" stored="true" indexed="true"/>
@@ -379,7 +379,6 @@
<field name="org" type="string" stored="true" indexed="true" />
<field name="userType" type="string" stored="true" indexed="true" />
<field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
- <field name="isSatelitteProv" type="boolean" stored="true" indexed="true" />
<field name="connType" type="string" stored="true" indexed="true" />
<field name="location" type="location" stored="true" indexed="true" />