You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:39 UTC
[23/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
new file mode 100644
index 0000000..8f2bee5
--- /dev/null
+++ b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to add document metadata to the index.
+ * Metadata may come from CrawlDb, parse or content metadata.
+ */
+package org.apache.nutch.indexer.metadata;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/build.xml b/nutch-plugins/index-more/build.xml
new file mode 100644
index 0000000..dec1e12
--- /dev/null
+++ b/nutch-plugins/index-more/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-more" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/ivy.xml b/nutch-plugins/index-more/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/index-more/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/plugin.xml b/nutch-plugins/index-more/plugin.xml
new file mode 100644
index 0000000..d920f72
--- /dev/null
+++ b/nutch-plugins/index-more/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="index-more"
+ name="More Indexing Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="index-more.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.more"
+ name="Nutch More Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="MoreIndexingFilter"
+ class="org.apache.nutch.indexer.more.MoreIndexingFilter"/>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/pom.xml b/nutch-plugins/index-more/pom.xml
new file mode 100644
index 0000000..80e5de0
--- /dev/null
+++ b/nutch-plugins/index-more/pom.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements. See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License. You may obtain a copy of the License at
+  ~
+  ~ http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<!-- Maven module of the index-more plugin; inherits from the nutch-plugins parent POM. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.nutch</groupId>
+    <artifactId>nutch-plugins</artifactId>
+    <version>1.13-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <artifactId>index-more</artifactId>
+  <packaging>jar</packaging>
+
+  <name>index-more</name>
+  <url>http://nutch.apache.org</url>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
new file mode 100644
index 0000000..6e64ede
--- /dev/null
+++ b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -0,0 +1,344 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.more;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.parse.Parse;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.tika.Tika;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import java.text.ParseException;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Date;
+import java.util.regex.*;
+import java.util.HashMap;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang.time.DateUtils;
+
+/**
+ * Add (or reset) a few metaData properties as respective fields (if they are
+ * available), so that they can be accurately used within the search index.
+ *
+ * 'lastModified' is indexed to support query by date, 'contentLength' obtains
+ * content length from the HTTP header, 'type' field is indexed to support query
+ * by type and finally the 'title' field is an attempt to reset the title if a
+ * content-disposition hint exists. The logic is that such a presence is
+ * indicative that the content provider wants the filename therein to be used as
+ * the title.
+ *
+ * Still need to make content-length searchable!
+ *
+ * @author John Xing
+ */
+
+public class MoreIndexingFilter implements IndexingFilter {
+ public static final Logger LOG = LoggerFactory
+ .getLogger(MoreIndexingFilter.class);
+
+ /** Get the MimeTypes resolver instance. */
+ private MimeUtil MIME;
+ private Tika tika = new Tika();
+
+ /** Map for mime-type substitution */
+ private HashMap<String, String> mimeMap = null;
+ private boolean mapMimes = false;
+
+  /**
+   * Enriches the document with the time, content-length, MIME type and
+   * content-disposition derived title fields.
+   *
+   * @param doc
+   *          document being indexed; modified in place
+   * @param parse
+   *          parse result whose ParseData supplies the metadata
+   * @param url
+   *          URL of the page
+   * @param datum
+   *          crawl datum of the page
+   * @param inlinks
+   *          inlinks of the page (not used by this filter)
+   * @return the document instance that was passed in
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    final ParseData parseData = parse.getData();
+    final String pageUrl = url.toString();
+
+    addTime(doc, parseData, pageUrl, datum);
+    addLength(doc, parseData, pageUrl);
+    addType(doc, parseData, pageUrl, datum);
+    resetTitle(doc, parseData, pageUrl);
+    return doc;
+  }
+
+  /**
+   * Adds time related fields. "lastModified" is added only when the
+   * Last-Modified metadata value is present and could be parsed. "date" is
+   * always added: it holds the parsed Last-Modified time or, failing that, the
+   * modified time of the CrawlDatum or, if that is not positive, its fetch
+   * time.
+   *
+   * @param doc
+   *          document to add the fields to
+   * @param data
+   *          parse data providing the Last-Modified metadata
+   * @param url
+   *          URL of the page, passed on to getTime for its log message
+   * @param datum
+   *          crawl datum providing the fallback times
+   * @return the document passed in
+   */
+  private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
+      CrawlDatum datum) {
+    long time = -1;
+
+    String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
+    if (lastModified != null) { // try parse last-modified
+      time = getTime(lastModified, url); // -1 if the value is unparseable
+      if (time != -1) {
+        // only index a real date, never the -1 "could not parse" marker
+        doc.add("lastModified", new Date(time));
+      }
+    }
+
+    if (time == -1) { // no usable last-modified in HTTP header
+      time = datum.getModifiedTime(); // use value in CrawlDatum
+      if (time <= 0) { // if also unset
+        time = datum.getFetchTime(); // use time the fetch took place
+      }
+    }
+
+    doc.add("date", new Date(time));
+    return doc;
+  }
+
+ private long getTime(String date, String url) {
+ long time = -1;
+ try {
+ time = HttpDateFormat.toLong(date);
+ } catch (ParseException e) {
+ // try to parse it as date in alternative format
+ try {
+ Date parsedDate = DateUtils.parseDate(date, new String[] {
+ "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
+ "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
+ "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
+ "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
+ "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
+ "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
+ "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
+ "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
+ "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
+ "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
+ "yyyy-MM-dd'T'HH:mm:ss'Z'" });
+ time = parsedDate.getTime();
+ // if (LOG.isWarnEnabled()) {
+ // LOG.warn(url + ": parsed date: " + date +" to:"+time);
+ // }
+ } catch (Exception e2) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn(url + ": can't parse erroneous date: " + date);
+ }
+ }
+ }
+ return time;
+ }
+
+ // Add Content-Length
+ private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
+ String contentLength = data.getMeta(Response.CONTENT_LENGTH);
+
+ if (contentLength != null) {
+ // NUTCH-1010 ContentLength not trimmed
+ String trimmed = contentLength.toString().trim();
+ if (!trimmed.isEmpty())
+ doc.add("contentLength", trimmed);
+ }
+ return doc;
+ }
+
+  /**
+   * <p>
+   * Adds the content type of the document to field "type". Unless
+   * <code>moreIndexingFilter.indexMimeTypeParts</code> is set to false, the
+   * parts obtained by splitting the type at "/" are added to the same field as
+   * well, so that search results can be confined by contentType or its
+   * primaryType or its subType.
+   * </p>
+   * <p>
+   * For example, if contentType is application/vnd.ms-powerpoint, search can be
+   * done with one of the following qualifiers
+   * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
+   * all case insensitive. The query filter is implemented in
+   * {@link TypeQueryFilter}.
+   * </p>
+   * <p>
+   * The content type is taken from the CrawlDatum metadata, then from the
+   * parse metadata; if neither has one it is detected from the URL. When mime
+   * type mapping is enabled and a mapping exists, the mapped type is indexed
+   * instead.
+   * </p>
+   *
+   * @param doc
+   *          document to add the field values to
+   * @param data
+   *          parse data providing the Content-Type metadata
+   * @param url
+   *          URL of the page, used for detection when no content type is set
+   * @param datum
+   *          crawl datum whose metadata is checked first for a Content-Type
+   * @return the document passed in
+   */
+  private NutchDocument addType(NutchDocument doc, ParseData data, String url,
+      CrawlDatum datum) {
+    String mimeType = null;
+    String contentType = null;
+
+    Writable tcontentType = datum.getMetaData().get(
+        new Text(Response.CONTENT_TYPE));
+    if (tcontentType != null) {
+      contentType = tcontentType.toString();
+    } else {
+      contentType = data.getMeta(Response.CONTENT_TYPE);
+    }
+
+    if (contentType == null) {
+      // Note by Jerome Charron on 20050415:
+      // Content Type not solved by a previous plugin
+      // Or unable to solve it... Trying to find it
+      // Should be better to use the doc content too
+      // (using MimeTypes.getMimeType(byte[], String), but I don't know
+      // which field it is?
+      mimeType = tika.detect(url);
+    } else {
+      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+    }
+
+    // Checks if we solved the content-type.
+    if (mimeType == null) {
+      return doc;
+    }
+
+    // Check if we have to map mime types. mimeMap is still null when the
+    // mapping file could not be opened, so guard against that as well.
+    if (mapMimes && mimeMap != null) {
+      // Check if the current mime is mapped
+      if (mimeMap.containsKey(mimeType)) {
+        // It's mapped, let's replace it
+        mimeType = mimeMap.get(mimeType);
+      }
+    }
+
+    contentType = mimeType;
+    doc.add("type", contentType);
+
+    // Check if we need to split the content type in sub parts
+    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+      String[] parts = getParts(contentType);
+
+      for (String part : parts) {
+        doc.add("type", part);
+      }
+    }
+
+    // leave this for future improvement
+    // MimeTypeParameterList parameterList = mimeType.getParameters()
+
+    return doc;
+  }
+
+ /**
+ * Utility method for splitting a mime type into type and subtype. The string
+ * is split at every "/", e.g. "text/html" yields "text" and "html".
+ *
+ * @param mimeType
+ *          mime type to split; must not be null
+ * @return the parts of the mime type in order of appearance
+ */
+ static String[] getParts(String mimeType) {
+ return mimeType.split("/");
+ }
+
+ // Reset title if we see non-standard HTTP header "Content-Disposition".
+ // It's a good indication that content provider wants filename therein
+ // be used as the title of this url.
+
+ // Patterns used to extract filename from possible non-standard
+ // HTTP header "Content-Disposition". Typically it looks like:
+ // Content-Disposition: inline; filename="foo.ppt"
+
+ // Configuration handed to this filter via setConf(); unrelated to the
+ // Content-Disposition patterns described above and declared below.
+ private Configuration conf;
+
+ // Filled by the static initializer below; resetTitle() tries the entries in
+ // array order and stops at the first one that matches.
+ static Pattern patterns[] = { null, null };
+
+ static {
+ try {
+ // order here is important: the quoted form (filename="..." or
+ // filename='...') is tried before the unquoted form
+ patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
+ patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
+ } catch (PatternSyntaxException e) {
+ // just ignore
+ // NOTE(review): if this ever fired, the affected entries would stay null
+ // and resetTitle() would fail when dereferencing them.
+ }
+ }
+
+ private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
+ String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
+ if (contentDisposition == null || doc.getFieldValue("title") != null)
+ return doc;
+
+ for (int i = 0; i < patterns.length; i++) {
+ Matcher matcher = patterns[i].matcher(contentDisposition);
+ if (matcher.find()) {
+ doc.add("title", matcher.group(1));
+ break;
+ }
+ }
+
+ return doc;
+ }
+
+  /**
+   * Stores the configuration, creates the MimeUtil instance and, when
+   * <code>moreIndexingFilter.mapMimeTypes</code> is true, loads the mime type
+   * mapping. Mapping is switched on only after the mapping has been read
+   * successfully; a failure is logged and leaves mapping disabled.
+   *
+   * @param conf
+   *          configuration to use
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    MIME = new MimeUtil(conf);
+
+    // stay disabled unless the mapping below loads without error
+    mapMimes = false;
+
+    if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false)) {
+      // Load the mapping
+      try {
+        readConfiguration();
+        mapMimes = true;
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    }
+  }
+
+ /** Returns the configuration previously stored by setConf(). */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+  /**
+   * Loads the mime type mapping from the configuration resource
+   * contenttype-mapping.txt into mimeMap. Blank lines and lines starting with
+   * "#" are skipped. Every other line is expected to be tab separated:
+   * the first column is the target type, each further column is a type that
+   * is mapped to it.
+   *
+   * @throws IOException
+   *           if the resource cannot be found or read
+   */
+  private void readConfiguration() throws IOException {
+    LOG.info("Reading content type mappings from file contenttype-mapping.txt");
+
+    // create the map first so that it is never null once loading was attempted
+    mimeMap = new HashMap<String, String>();
+
+    java.io.Reader resource = conf
+        .getConfResourceAsReader("contenttype-mapping.txt");
+    if (resource == null) {
+      throw new IOException(
+          "Configuration resource contenttype-mapping.txt not found");
+    }
+
+    boolean formatWarningShown = false;
+    BufferedReader reader = new BufferedReader(resource);
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        // Strings are immutable: the trimmed value has to be assigned back
+        line = line.trim();
+        if (StringUtils.isBlank(line) || line.startsWith("#")) {
+          continue;
+        }
+
+        String[] parts = line.split("\t");
+
+        // Must be at least two parts
+        if (parts.length > 1) {
+          for (int i = 1; i < parts.length; i++) {
+            mimeMap.put(parts[i].trim(), parts[0].trim());
+          }
+        } else {
+          LOG.warn("Wrong format of line: {}", line);
+          if (!formatWarningShown) {
+            LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]");
+            formatWarningShown = true;
+          }
+        }
+      }
+    } finally {
+      // release the underlying stream even if reading fails
+      reader.close();
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html
new file mode 100644
index 0000000..7b8fade
--- /dev/null
+++ b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>A more indexing plugin, adds "more" index fields:
+last modified date, MIME type, content length.</p><p></p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
new file mode 100644
index 0000000..f918dde
--- /dev/null
+++ b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.more;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestMoreIndexingFilter {
+
+ @Test
+ public void testContentType() throws IndexingException {
+ Configuration conf = NutchConfiguration.create();
+ assertContentType(conf, "text/html", "text/html");
+ assertContentType(conf, "text/html; charset=UTF-8", "text/html");
+ }
+
+ @Test
+ public void testGetParts() {
+ String[] parts = MoreIndexingFilter.getParts("text/html");
+ assertParts(parts, 2, "text", "html");
+ }
+
+  /**
+   * Checks that only the complete MIME type is indexed in field "type" when
+   * moreIndexingFilter.indexMimeTypeParts is set to false.
+   *
+   * @since NUTCH-901
+   */
+  @Test
+  public void testNoParts() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    NutchDocument doc = new NutchDocument();
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+
+    // let unexpected exceptions propagate so that JUnit reports them with
+    // their full stack trace instead of a possibly null message
+    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+        new CrawlDatum(), new Inlinks());
+
+    Assert.assertTrue(doc.getFieldNames().contains("type"));
+    Assert.assertEquals(1, doc.getField("type").getValues().size());
+    Assert.assertEquals("text/html", doc.getFieldValue("type"));
+  }
+
+ @Test
+ public void testContentDispositionTitle() throws IndexingException {
+ Configuration conf = NutchConfiguration.create();
+
+ Metadata metadata = new Metadata();
+ metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
+ MoreIndexingFilter filter = new MoreIndexingFilter();
+ filter.setConf(conf);
+
+ Text url = new Text("http://www.example.com/");
+ ParseImpl parseImpl = new ParseImpl("text", new ParseData(
+ new ParseStatus(), "title", new Outlink[0], metadata));
+
+ NutchDocument doc = new NutchDocument();
+ doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+
+ Assert.assertEquals("content-disposition not detected", "filename.ext",
+ doc.getFieldValue("title"));
+
+ /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
+ doc = new NutchDocument();
+ doc.add("title", "title");
+ doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+ Assert.assertEquals("do not add second title by content-disposition",
+ "title", doc.getFieldValue("title"));
+ }
+
+ private void assertParts(String[] parts, int count, String... expected) {
+ Assert.assertEquals(count, parts.length);
+ for (int i = 0; i < expected.length; i++) {
+ Assert.assertEquals(expected[i], parts[i]);
+ }
+ }
+
+ private void assertContentType(Configuration conf, String source,
+ String expected) throws IndexingException {
+ Metadata metadata = new Metadata();
+ metadata.add(Response.CONTENT_TYPE, source);
+ MoreIndexingFilter filter = new MoreIndexingFilter();
+ filter.setConf(conf);
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
+ "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+ metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
+ new Inlinks());
+ Assert.assertEquals("mime type not detected", expected,
+ doc.getFieldValue("type"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/README.txt b/nutch-plugins/index-replace/README.txt
new file mode 100644
index 0000000..4c866a7
--- /dev/null
+++ b/nutch-plugins/index-replace/README.txt
@@ -0,0 +1,95 @@
+IndexReplace plugin
+
+Allows indexing-time regexp replace manipulation of metadata fields.
+
+Configuration Example
+ <property>
+ <name>index.replace.regexp</name>
+ <value>
+ id=/file\:/http\:my.site.com/
+ url=/file\:/http\:my.site.com/2
+ </value>
+ </property>
+
+Property format: index.replace.regexp
+ The format of the property is a list of regexp replacements, one line per field being
+ modified. Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
+
+ The fieldname precedes the equal sign. The first character after the equal sign signifies
+ the delimiter for the regexp, the replacement value and the flags.
+
+Replacement Sequence
+ The replacements will happen in the order listed. If a field needs multiple replacement operations
+ they may be listed more than once.
+
+RegExp Format
+ The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
+ here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
+ Patterns are compiled when the plugin is initialized for efficiency.
+
+Replacement Format
+ The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
+ http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
+
+Flags
+ The flags value is an integer sum of the flag values defined in
+ http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
+
+Creating New Fields
+ If you express the fieldname as fldname1:fldname2=[replacement], then the replacer will create a new field
+ from the source field. The source field remains unmodified. This is an alternative to solrindex-mapping
+ which is only able to copy fields verbatim.
+
+Multi-valued Fields
+ If a field has multiple values, the replacement will be applied to each value in turn.
+
+Non-string Datatypes
+ Replacement is possible only on String field datatypes. If the field you name in the property is
+ not a String datatype, it will be silently ignored.
+
+Host and URL specific replacements.
+ If the replacements should apply only to specific pages, then add a sequence like
+
+ hostmatch=hostmatchpattern
+ fld1=/regexp/replace/flags
+ fld2=/regexp/replace/flags
+
+ or
+ urlmatch=urlmatchpattern
+ fld1=/regexp/replace/flags
+ fld2=/regexp/replace/flags
+
+When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
+will apply to all parsed pages. Replacements following a hostmatch or urlmatch will be applied
+to pages which match the host or url field (up to the next hostmatch or urlmatch line). hostmatch
+and urlmatch patterns must be unique in this property.
+
+Plugin order
+ In most cases you will want this plugin to run last.
+
+Testing your match patterns
+ Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
+ can help get the basics of your pattern working.
+ To test in nutch:
+ Prepare a test HTML file with the field contents you want to test.
+ Place this in a directory accessible to nutch.
+ Use the file:/// syntax to list the test file(s) in a test/urls seed list.
+ See the nutch faq "index my local file system" for conf settings you will need.
+ (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
+ test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
+ patterns also match the file: URL pattern)
+
+ Run..
+ bin/nutch inject crawl/crawldb test
+ bin/nutch generate crawl/crawldb crawl/segments
+ bin/nutch fetch crawl/segments/[segment]
+ bin/nutch parse crawl/segments/[segment]
+ bin/nutch invertlinks crawl/linkdb -dir crawl/segments
+ ...index your document, for example with SOLR...
+ bin/nutch solrindex http://localhost:8983/solr crawl/crawldb/ -linkdb crawl/linkdb/ crawl/segments/[segment] -filter -normalize
+
+ Inspect hadoop.log for info about pattern parsing and compilation..
+ grep replace logs/hadoop.log
+
+ To inspect your index with the solr admin panel...
+ http://localhost:8983/solr/#/
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/build.xml b/nutch-plugins/index-replace/build.xml
new file mode 100644
index 0000000..ea8c95d
--- /dev/null
+++ b/nutch-plugins/index-replace/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-replace" default="jar-core">
+
+ <import file="../build-plugin.xml" />
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/index-basic/*.jar" />
+ <include name="**/index-metadata/*.jar" />
+ </fileset>
+ <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+ </path>
+
+ <!-- Compile Unit test dependencies -->
+ <target name="deps-test-compile">
+ <ant target="compile-test" inheritall="false" dir="../index-basic"/>
+ <ant target="compile-test" inheritall="false" dir="../index-metadata"/>
+ </target>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+ <ant target="deploy" inheritall="false" dir="../protocol-file" />
+ <ant target="deploy" inheritall="false" dir="../parse-html" />
+ <ant target="deploy" inheritall="false" dir="../parse-metatags" />
+ <ant target="deploy" inheritall="false" dir="../index-basic" />
+ <ant target="deploy" inheritall="false" dir="../index-metadata" />
+ </target>
+
+ <!-- Copy test file for junit test -->
+ <mkdir dir="${build.test}/data" />
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.html" />
+ </fileset>
+ </copy>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/ivy.xml b/nutch-plugins/index-replace/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/index-replace/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <!-- Configurations are shared: they are included from the common ivy-configurations.xml. -->
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!-- Publish one artifact in the master configuration; no name is given, so it is taken from the module name. -->
+ <artifact conf="master"/>
+ </publications>
+
+ <!-- This plugin declares no Ivy dependencies of its own. -->
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/plugin.xml b/nutch-plugins/index-replace/plugin.xml
new file mode 100644
index 0000000..3cffe60
--- /dev/null
+++ b/nutch-plugins/index-replace/plugin.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="index-replace"
+ name="Replace Indexer"
+ version="1.0"
+ provider-name="PeterCiuffetti">
+
+ <!-- The plugin code lives in index-replace.jar; all of its names are exported. -->
+ <runtime>
+ <library name="index-replace.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <!-- Registers ReplaceIndexer as an implementation of the IndexingFilter extension point. -->
+ <extension id="org.apache.nutch.indexer.replace"
+ name="Replace Indexer"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="ReplaceIndexer"
+ class="org.apache.nutch.indexer.replace.ReplaceIndexer"/>
+ </extension>
+
+</plugin>
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/pom.xml b/nutch-plugins/index-replace/pom.xml
new file mode 100644
index 0000000..d39851d
--- /dev/null
+++ b/nutch-plugins/index-replace/pom.xml
@@ -0,0 +1,50 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <!-- groupId and version are inherited from the nutch-plugins parent. -->
+  <parent>
+    <groupId>org.apache.nutch</groupId>
+    <artifactId>nutch-plugins</artifactId>
+    <version>1.13-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <artifactId>index-replace</artifactId>
+  <packaging>jar</packaging>
+
+  <name>index-replace</name>
+  <url>http://nutch.apache.org</url>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <!--
+    Sibling plugins used by this module's tests. They are released together
+    with this module, so they track ${project.version}; the bare
+    ${parent.version} expression is deprecated and makes Maven emit a warning.
+  -->
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.nutch</groupId>
+      <artifactId>index-basic</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.nutch</groupId>
+      <artifactId>index-metadata</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java
new file mode 100644
index 0000000..ddfe24d
--- /dev/null
+++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.replace;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * POJO to store a field name, its match pattern and its replacement string.
+ *
+ * A checkAndReplace method is provided where you can simultaneously check if
+ * the field matches this replacer and if the pattern matches your field value.
+ *
+ * @author Peter Ciuffetti
+ */
+public class FieldReplacer {
+
+ private static final Log LOG = LogFactory.getLog(FieldReplacer.class
+ .getName());
+
+ private final String fieldName;
+ private final String toFieldName;
+ private final Pattern pattern;
+ private final String replacement;
+ private boolean isValid;
+
+  /**
+   * Create a FieldReplacer for a field.
+   *
+   * Problems with the arguments (a missing field name, a missing target field
+   * name, a missing pattern, or a pattern/flags combination that does not
+   * compile) are logged and the object is marked invalid instead of throwing.
+   * This prevents the caller from attempting invalid replacements; check
+   * {@link #isValid()} before use.
+   *
+   * @param fieldName
+   *          the name of the source field to operate on. Required.
+   * @param toFieldName
+   *          the name of the target field. Required.
+   * @param pattern
+   *          the pattern the field must match. Required.
+   * @param replacement
+   *          the replacement string; null is treated as the empty string
+   * @param flags
+   *          the Pattern flags value, or null if no flags are needed
+   */
+  public FieldReplacer(String fieldName, String toFieldName, String pattern,
+      String replacement, Integer flags) {
+
+    this.isValid = true;
+    // Must have non-empty field names and a non-empty pattern.
+    if (fieldName == null || fieldName.trim().length() == 0) {
+      LOG.error("Empty fieldName provided, FieldReplacer marked invalid.");
+      this.isValid = false;
+    }
+    if (toFieldName == null || toFieldName.trim().length() == 0) {
+      LOG.error("Empty toFieldName for field " + fieldName
+          + " provided, FieldReplacer marked invalid.");
+      this.isValid = false;
+    }
+    if (pattern == null || pattern.trim().length() == 0) {
+      LOG.error("Empty pattern for field " + fieldName
+          + " provided, FieldReplacer marked invalid.");
+      this.isValid = false;
+    }
+
+    this.replacement = (replacement == null) ? "" : replacement;
+
+    // Null names were already reported above; store an empty name rather than
+    // failing with a NullPointerException on trim().
+    this.fieldName = (fieldName == null) ? "" : fieldName.trim();
+    this.toFieldName = (toFieldName == null) ? "" : toFieldName.trim();
+
+    Pattern compiled = null;
+    if (this.isValid) {
+      LOG.info("Compiling pattern " + pattern + " for field " + fieldName);
+      try {
+        if (flags != null) {
+          compiled = Pattern.compile(pattern, flags);
+        } else {
+          compiled = Pattern.compile(pattern);
+        }
+      } catch (PatternSyntaxException e) {
+        LOG.error("Pattern " + pattern + " for field " + fieldName
+            + " failed to compile: " + e.toString());
+        this.isValid = false;
+      } catch (IllegalArgumentException e) {
+        // Pattern.compile rejects flag values with undefined bits set.
+        LOG.error("Pattern " + pattern + " for field " + fieldName
+            + " failed to compile: " + e.toString());
+        this.isValid = false;
+      }
+    }
+    // Stays null whenever the replacer is invalid.
+    this.pattern = compiled;
+  }
+
+ /**
+ * Field replacer with the input and output field the same.
+ *
+ * @param fieldName the name of the field that is both read and written
+ * @param pattern the pattern the field must match
+ * @param replacement the replacement string
+ * @param flags the Pattern flags value, or null if no flags are needed
+ */
+ public FieldReplacer(String fieldName, String pattern, String replacement,
+ Integer flags) {
+ this(fieldName, fieldName, pattern, replacement, flags);
+ }
+
+ /** @return the trimmed name of the source field this replacer reads */
+ public String getFieldName() {
+ return this.fieldName;
+ }
+
+ /** @return the trimmed name of the target field the replaced value goes to */
+ public String getToFieldName() {
+ return this.toFieldName;
+ }
+
+ /** @return the compiled pattern, or null if this replacer is not valid */
+ public Pattern getPattern() {
+ return this.pattern;
+ }
+
+ /** @return the replacement string; never null (the constructor maps null to "") */
+ public String getReplacement() {
+ return this.replacement;
+ }
+
+ /**
+ * Does this FieldReplacer have a valid fieldname and pattern?
+ *
+ * @return false if the constructor found an empty field name or pattern, or
+ * if the pattern failed to compile; true otherwise
+ */
+ public boolean isValid() {
+ return this.isValid;
+ }
+
+  /**
+   * Return the replacement value for a field value.
+   *
+   * This does not check for a matching field; the caller must decide if this
+   * FieldReplacer should operate on this value by checking getFieldName().
+   *
+   * The method returns the value with the replacement. If the value returned is
+   * not different then either the pattern didn't match or the replacement was a
+   * no-op.
+   *
+   * @param value
+   *          the field value to run the replacement on; may be null
+   * @return the value with every pattern match replaced, or the value
+   *         unchanged if it is null or this replacer is not valid
+   */
+  public String replace(String value) {
+    // An invalid replacer has no compiled pattern, and a null value cannot be
+    // matched; in both cases hand the input back untouched.
+    if (!this.isValid || value == null) {
+      return value;
+    }
+    return this.pattern.matcher(value).replaceAll(this.replacement);
+  }
+
+  /**
+   * Check a field against this replacer and, if it applies, return the
+   * replaced value.
+   *
+   * The checks run cheapest first so that non-matching input is rejected
+   * quickly. A null result means there is nothing to replace: the field name
+   * is not the one this replacer handles, the value is null or empty, the
+   * replacer is not valid, or the pattern is not found in the value.
+   *
+   * @param fieldName
+   *          the name of the field you are checking
+   * @param value
+   *          the value of the field you are checking
+   * @return the value with all matches replaced, or null if no replacement
+   *         applies
+   */
+  public String checkAndReplace(String fieldName, String value) {
+    if (!this.fieldName.equals(fieldName)) {
+      return null;
+    }
+    if (value == null || value.length() == 0) {
+      return null;
+    }
+    if (!this.isValid) {
+      return null;
+    }
+
+    Matcher matcher = this.pattern.matcher(value);
+    return matcher.find() ? matcher.replaceAll(this.replacement) : null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
new file mode 100644
index 0000000..7017603
--- /dev/null
+++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
@@ -0,0 +1,330 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.replace;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Do pattern replacements on selected field contents prior to indexing.
+ *
+ * To use this plugin, add <code>index-replace</code> to your
+ * <code>plugin.includes</code>. Example:
+ *
+ * <pre>
+ * <property>
+ * <name>plugin.includes</name>
+ * <value>protocol-(http)|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata|replace)|urlnormalizer-(pass|regex|basic)|indexer-solr</value>
+ * </property>
+ * </pre>
+ *
+ * And then add the <code>index.replace.regexp</code> property to
+ * <code>conf/nutch-site.xml</code>. This contains a list of replacement
+ * instructions per field name, one per line. eg.
+ *
+ * <pre>
+ * fieldname=/regexp/replacement/[flags]
+ * </pre>
+ *
+ * <pre>
+ * <property>
+ * <name>index.replace.regexp</name>
+ * <value>
+ * hostmatch=.*\\.com
+ * title=/search/replace/2
+ * </value>
+ * </property>
+ * </pre>
+ *
+ * <code>hostmatch=</code> and <code>urlmatch=</code> lines indicate the match
+ * pattern for a host or url. The field replacements that follow this line will
+ * apply only to pages from the matching host or url. Replacements run in the
+ * order specified. Field names may appear multiple times if multiple
+ * replacements are needed.
+ *
+ * The property format is defined in greater detail in
+ * <code>conf/nutch-default.xml</code>.
+ *
+ * @author Peter Ciuffetti
+ * @see <a
+ * href="https://issues.apache.org/jira/browse/NUTCH-2058">NUTCH-2058</a>
+ */
+public class ReplaceIndexer implements IndexingFilter {
+
+  private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class
+      .getName());
+
+  /** Special field name signifying the start of a host-specific match set */
+  private static final String HOSTMATCH = "hostmatch";
+  /** Special field name signifying the start of a url-specific match set */
+  private static final String URLMATCH = "urlmatch";
+
+  /*
+   * Replacement lists keyed by host / url pattern, in configuration order.
+   * Both maps are static, i.e. shared by every ReplaceIndexer instance, and
+   * setConf() clears and refills them on each call.
+   */
+  private static final Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_HOST = new LinkedHashMap<Pattern, List<FieldReplacer>>();
+  private static final Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_URL = new LinkedHashMap<Pattern, List<FieldReplacer>>();
+
+  /** Finds each non-empty line of the property value. */
+  private static final Pattern LINE_SPLIT = Pattern.compile("(^.+$)+",
+      Pattern.MULTILINE);
+  /** Splits a line at its first '=' into a name (group 1) and a value (group 2). */
+  private static final Pattern NAME_VALUE_SPLIT = Pattern.compile("(.*?)=(.*)");
+
+ private Configuration conf;
+
+  /**
+   * {@inheritDoc}
+   *
+   * Stores the configuration, discards any previously parsed replacement
+   * sets, and re-parses the <code>index.replace.regexp</code> property if it
+   * is set.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // Start from a clean slate so stale rules never survive a reconfigure.
+    FIELDREPLACERS_BY_HOST.clear();
+    FIELDREPLACERS_BY_URL.clear();
+
+    String regexpProperty = conf.get("index.replace.regexp", null);
+    if (regexpProperty == null) {
+      return;
+    }
+    LOG.debug("Parsing index.replace.regexp property");
+    this.parseConf(regexpProperty);
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * @return the configuration last passed to setConf, or null if none was set
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Parse the property value into a set of maps that store a list of
+   * replacements by field for each host and url configured into the property.
+   *
+   * The value is read line by line. A <code>hostmatch=</code> or
+   * <code>urlmatch=</code> line starts a new match set; every other line is
+   * expected to look like <code>field=/pattern/replacement/[flags]</code>,
+   * where the first character after <code>=</code> is taken as the separator.
+   * A field name of the form <code>source:target</code> writes the result to
+   * a different field. Lines that cannot be parsed, and replacers that turn
+   * out to be invalid, are logged and skipped.
+   *
+   * @param propertyValue
+   *          the raw value of the index.replace.regexp property
+   */
+  private void parseConf(String propertyValue) {
+    if (propertyValue == null || propertyValue.trim().length() == 0) {
+      return;
+    }
+
+    // At the start, all replacements apply globally to every host.
+    Pattern hostPattern = Pattern.compile(".*");
+    Pattern urlPattern = null;
+
+    // Split the property into lines
+    Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue);
+    while (lineMatcher.find()) {
+      String line = lineMatcher.group();
+      if (line == null || line.length() == 0) {
+        continue;
+      }
+
+      // Split the line into field and value
+      Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim());
+      if (!nameValueMatcher.find()) {
+        continue;
+      }
+      String fieldName = nameValueMatcher.group(1).trim();
+      String value = nameValueMatcher.group(2);
+
+      // Check if the field name is one of our special cases.
+      if (HOSTMATCH.equals(fieldName)) {
+        // A new host section ends any url section that was open.
+        urlPattern = null;
+        try {
+          hostPattern = Pattern.compile(value);
+        } catch (PatternSyntaxException pse) {
+          LOG.error("hostmatch pattern " + value + " does not compile: "
+              + pse.getMessage());
+          // Deactivate this invalid match set by making it match no host.
+          hostPattern = Pattern.compile("willnotmatchanyhost");
+        }
+        continue;
+      }
+      if (URLMATCH.equals(fieldName)) {
+        try {
+          urlPattern = Pattern.compile(value);
+        } catch (PatternSyntaxException pse) {
+          LOG.error("urlmatch pattern " + value + " does not compile: "
+              + pse.getMessage());
+          // Deactivate this invalid match set by making it match no url.
+          urlPattern = Pattern.compile("willnotmatchanyurl");
+        }
+        continue;
+      }
+      if (value.length() <= 3) {
+        // Too short to hold separator, pattern, separator and replacement.
+        continue;
+      }
+
+      // If the fieldname has a colon, this indicates a different target
+      // field.
+      String toFieldName = fieldName;
+      int colon = fieldName.indexOf(':');
+      if (colon > 0) {
+        toFieldName = fieldName.substring(colon + 1);
+        fieldName = fieldName.substring(0, colon);
+      }
+      String sep = value.substring(0, 1);
+
+      // Divide the value into pattern / replacement / flags.
+      value = value.substring(1);
+      if (!value.contains(sep)) {
+        LOG.error("Pattern '" + line + "', not parseable. Missing separator "
+            + sep);
+        continue;
+      }
+      String pattern = value.substring(0, value.indexOf(sep));
+      value = value.substring(pattern.length() + 1);
+      String replacement = value;
+      if (value.contains(sep)) {
+        replacement = value.substring(0, value.indexOf(sep));
+      }
+      int flags = 0;
+      if (value.length() > replacement.length() + 1) {
+        value = value.substring(replacement.length() + 1).trim();
+        try {
+          flags = Integer.parseInt(value);
+        } catch (NumberFormatException e) {
+          LOG.error("Pattern " + line + ", has invalid flags component");
+          continue;
+        }
+      }
+      Integer iFlags = (flags > 0) ? Integer.valueOf(flags) : null;
+
+      // Make a FieldReplacer out of these params.
+      FieldReplacer fr = new FieldReplacer(fieldName, toFieldName, pattern,
+          replacement, iFlags);
+      if (!fr.isValid()) {
+        // The constructor already logged why. Registering it would still let
+        // doReplace() rewrite the target field with unreplaced values.
+        continue;
+      }
+
+      // Add this field replacer to the list for this host or URL.
+      if (urlPattern != null) {
+        addReplacer(FIELDREPLACERS_BY_URL, urlPattern, fr);
+      } else {
+        addReplacer(FIELDREPLACERS_BY_HOST, hostPattern, fr);
+      }
+    }
+  }
+
+  /**
+   * Append a replacer to the list stored under the given key pattern, creating
+   * the list on first use.
+   */
+  private static void addReplacer(Map<Pattern, List<FieldReplacer>> replacers,
+      Pattern key, FieldReplacer replacer) {
+    List<FieldReplacer> forKey = replacers.get(key);
+    if (forKey == null) {
+      forKey = new ArrayList<FieldReplacer>();
+      replacers.put(key, forKey);
+    }
+    forKey.add(replacer);
+  }
+
+  /**
+   * {@inheritDoc}
+   *
+   * Applies the host-keyed replacement sets first and the url-keyed sets
+   * second. The same document instance is modified in place and returned; a
+   * null document is passed through untouched.
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    if (doc == null) {
+      return doc;
+    }
+
+    if (!FIELDREPLACERS_BY_HOST.isEmpty()) {
+      this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
+    }
+    if (!FIELDREPLACERS_BY_URL.isEmpty()) {
+      this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
+    }
+    return doc;
+  }
+
+  /**
+   * Walk the given replacement map and rewrite the matching fields of the
+   * Nutch document.
+   *
+   * For every String value of the key field, each map entry whose pattern is
+   * found in that value contributes its replacers. A replacer is applied only
+   * if the document has its source field; the target field is then removed and
+   * re-created from the replaced String values of the source field.
+   *
+   * @param doc
+   *          the document we are modifying
+   * @param keyName
+   *          either "host" or "url" -- the field that determines the
+   *          replacement set used
+   * @param replaceMap
+   *          the list of FieldReplacers that applies to this keyName.
+   */
+  private void doReplace(NutchDocument doc, String keyName,
+      Map<Pattern, List<FieldReplacer>> replaceMap) {
+
+    if (doc == null || replaceMap.isEmpty()) {
+      return;
+    }
+
+    Collection<String> presentFields = doc.getFieldNames();
+    NutchField keyField = doc.getField(keyName);
+    if (keyField == null) {
+      // Without the key field there is nothing to select a match set with.
+      return;
+    }
+
+    List<Object> keyValues = keyField.getValues();
+    if (keyValues.isEmpty()) {
+      // The key field exists but carries no values.
+      return;
+    }
+
+    // One key value is expected, but handle several.
+    for (Object rawKey : keyValues) {
+      if (!(rawKey instanceof String)) {
+        // Also skips null values.
+        continue;
+      }
+      String keyValue = (String) rawKey;
+
+      for (Map.Entry<Pattern, List<FieldReplacer>> matchSet : replaceMap
+          .entrySet()) {
+        if (!matchSet.getKey().matcher(keyValue).find()) {
+          // This match set does not apply to the document's host / url.
+          continue;
+        }
+
+        for (FieldReplacer replacer : matchSet.getValue()) {
+          String sourceName = replacer.getFieldName();
+          if (!presentFields.contains(sourceName)) {
+            continue;
+          }
+
+          // Collect the replaced values before touching the document.
+          List<String> replaced = new ArrayList<String>();
+          for (Object rawValue : doc.getField(sourceName).getValues()) {
+            if (rawValue instanceof String) {
+              replaced.add(replacer.replace((String) rawValue));
+            }
+          }
+
+          // Swap the target field's content for the replaced values.
+          String targetName = replacer.getToFieldName();
+          doc.removeField(targetName);
+          for (String newValue : replaced) {
+            doc.add(targetName, newValue);
+          }
+        }
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java
new file mode 100644
index 0000000..28c24a4
--- /dev/null
+++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to allow pattern replacements on metadata.
+ */
+package org.apache.nutch.indexer.replace;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java b/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java
new file mode 100644
index 0000000..ca90ca3
--- /dev/null
+++ b/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java
@@ -0,0 +1,456 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.replace;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.indexer.metadata.MetadataIndexer;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit tests for the <code>index-replace</code> plugin.
+ *
+ * In these tests, the sample file has some meta tags added to the Nutch
+ * document by the <code>index-metadata</code> plugin. The
+ * <code>index-replace</code> plugin is then used to either change (or not
+ * change) the fields depending on the various values of
+ * <code>index.replace.regexp</code> property being provided to Nutch.
+ *
+ *
+ * @author Peter Ciuffetti
+ *
+ */
+public class TestIndexReplace {
+
+ private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp";
+
+ private String fileSeparator = System.getProperty("file.separator");
+ private String sampleDir = System.getProperty("test.data", ".");
+ private String sampleFile = "testIndexReplace.html";
+
+  /**
+   * Run a test file through the Nutch parser and index filters.
+   *
+   * The file is fetched through the protocol plugin selected for its
+   * <code>file:</code> URL, parsed, and then passed through the basic,
+   * metadata and replace indexing filters in that order. Any exception fails
+   * the calling test.
+   *
+   * @param fileName
+   *          name of the sample file, relative to the test data directory
+   * @param conf
+   *          the configuration handed to every filter and to the
+   *          protocol/parse factories
+   * @return the Nutch document with the replace indexer applied
+   */
+  public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
+    NutchDocument doc = new NutchDocument();
+
+    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
+    basicIndexer.setConf(conf);
+    Assert.assertNotNull(basicIndexer);
+
+    MetadataIndexer metaIndexer = new MetadataIndexer();
+    metaIndexer.setConf(conf);
+    // Check the metadata indexer itself, not the basic indexer a second time.
+    Assert.assertNotNull(metaIndexer);
+
+    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
+    replaceIndexer.setConf(conf);
+    Assert.assertNotNull(replaceIndexer);
+
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Text text = new Text(urlString);
+      CrawlDatum crawlDatum = new CrawlDatum();
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(text, crawlDatum)
+          .getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      crawlDatum.setFetchTime(100L);
+
+      Inlinks inlinks = new Inlinks();
+      doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+      doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+      doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+
+    return doc;
+  }
+
+ /**
+ * Test property parsing.
+ *
+ * The filter does not expose details of the parse. So all we are checking is
+ * that the parse does not throw a runtime exception and that the value
+ * provided is the value returned.
+ */
+ @Test
+ public void testPropertyParse() {
+ Configuration conf = NutchConfiguration.create();
+ // Mixes global rules, a hostmatch section, a source:target field and a
+ // urlmatch section in one property value.
+ String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/2\n"
+ + " metatag.keywords=/\\,/\\!/\n"
+ + " hostmatch=.*.com\n"
+ + " metatag.keywords=/\\,/\\?/\n"
+ + " metatag.author:dc_author=/\\s+/ David /\n"
+ + " urlmatch=.*.html\n"
+ + " metatag.keywords=/\\,/\\./\n" + " metatag.author=/\\s+/ D. /\n";
+
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+
+ ReplaceIndexer rp = new ReplaceIndexer();
+ try {
+ rp.setConf(conf);
+ } catch (RuntimeException ohno) {
+ Assert.fail("Unable to parse a valid index.replace.regexp property! "
+ + ohno.getMessage());
+ }
+
+ Configuration parsedConf = rp.getConf();
+
+ // The filter must hand back the configuration with the property unchanged.
+ Assert.assertEquals(indexReplaceProperty,
+ parsedConf.get(INDEX_REPLACE_PROPERTY));
+ }
+
+ /**
+ * Test metatag value replacement using global replacement settings.
+ *
+ * The index.replace.regexp property does not use hostmatch or urlmatch, so
+ * all patterns are global. All three metatag fields are expected to come
+ * back with their replaced values.
+ */
+ @Test
+ public void testGlobalReplacement() {
+ String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+ String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+ String expectedAuthor = "Peter D. Ciuffetti";
+ String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/\n"
+ + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n";
+
+ Configuration conf = NutchConfiguration.create();
+ // Plugins enabled for this test run.
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ // Presumably makes the three meta tags available as metatag.* fields; verify against the metatags/index-metadata plugins.
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // Assert that our metatags have changed.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ Assert
+ .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+ Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+ }
+
+ /**
+ * Test that invalid property settings are handled and ignored.
+ *
+ * This test provides an invalid property setting that will fail property
+ * parsing and Pattern.compile. The expected outcome is that the patterns will
+ * not cause failure and the targeted fields will not be modified by the
+ * filter.
+ */
+ @Test
+ public void testInvalidPatterns() {
+ String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+ String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+ String expectedAuthor = "Peter Ciuffetti";
+ // Contains, in order: a pattern that does not compile, a non-numeric
+ // flags component, and a value with no closing separator.
+ String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n"
+ + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete";
+
+ Configuration conf = NutchConfiguration.create();
+ // Plugins enabled for this test run.
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // Assert that our metatags have not changed.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ Assert
+ .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+ Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+ }
+
+ /**
+ * Test URL pattern matching: urlmatch=.*.html is expected to match the sample document, so all three replacements apply.
+ */
+ @Test
+ public void testUrlMatchesPattern() {
+ String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+ String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+ String expectedAuthor = "Peter D. Ciuffetti";
+ String indexReplaceProperty = " urlmatch=.*.html\n"
+ + " metatag.description=/this(.*)plugin/this awesome plugin/\n"
+ + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n";
+
+ Configuration conf = NutchConfiguration.create();
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter. NOTE(review): the value exceeds Integer.MAX_VALUE; confirm it is never parsed as an int.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // All three metatag fields must carry the replaced values.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ Assert
+ .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+ Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+ }
+
+ /**
+ * Test URL pattern not matching.
+ *
+ * With urlmatch=.*.xml the filter is expected to leave all three fields unchanged.
+ */
+ @Test
+ public void testUrlNotMatchesPattern() {
+ String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+ String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+ String expectedAuthor = "Peter Ciuffetti";
+ String indexReplaceProperty = " urlmatch=.*.xml\n"
+ + " metatag.description=/this(.*)plugin/this awesome plugin/\n"
+ + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n";
+
+ Configuration conf = NutchConfiguration.create();
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter. NOTE(review): the value exceeds Integer.MAX_VALUE; confirm it is never parsed as an int.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // The fields must still hold the values found in the source document.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ Assert
+ .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+ Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+ }
+
+ /**
+ * Test a global rule for description (listed before urlmatch) combined with
+ * URL pattern rules for keywords and author (listed after urlmatch=.*.html).
+ *
+ * All three should be triggered. The description rule also tests replacement groups: $1 reuses the text captured by (.*).
+ */
+ @Test
+ public void testGlobalAndUrlMatchesPattern() {
+ String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+ String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+ String expectedAuthor = "Peter D. Ciuffetti";
+ String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
+ + " urlmatch=.*.html\n"
+ + " metatag.keywords=/\\,/\\!/\n"
+ + " metatag.author=/\\s+/ D. /\n";
+
+ Configuration conf = NutchConfiguration.create();
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter. NOTE(review): the value exceeds Integer.MAX_VALUE; confirm it is never parsed as an int.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // Both the global rule and the URL pattern rules must have been applied.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ Assert
+ .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+ Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+ }
+
+ /**
+ * Test a global rule for description (listed before urlmatch) combined with
+ * URL pattern rules for keywords and author (listed after urlmatch=.*.xml).
+ *
+ * Only the global match should be triggered; keywords and author are expected to stay unchanged.
+ */
+ @Test
+ public void testGlobalAndUrlNotMatchesPattern() {
+ String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+ String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+ String expectedAuthor = "Peter Ciuffetti";
+ String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
+ + " urlmatch=.*.xml\n"
+ + " metatag.keywords=/\\,/\\!/\n"
+ + " metatag.author=/\\s+/ D. /\n";
+
+ Configuration conf = NutchConfiguration.create();
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter. NOTE(review): the value exceeds Integer.MAX_VALUE; confirm it is never parsed as an int.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // Assert that the description has changed and that keywords and author have not.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ Assert
+ .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+ Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+ }
+
+ /**
+ * Test order-specific replacement settings.
+ *
+ * Five rules are chained on the same field, each one matching the text written
+ * by the rule before it, so the expected value appears only if they run in the order specified.
+ */
+ @Test
+ public void testReplacementsRunInSpecifedOrder() {
+ String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+ String indexReplaceProperty = " metatag.description=/this plugin/this amazing plugin/\n"
+ + " metatag.description=/this amazing plugin/this valuable plugin/\n"
+ + " metatag.description=/this valuable plugin/this cool plugin/\n"
+ + " metatag.description=/this cool plugin/this wicked plugin/\n"
+ + " metatag.description=/this wicked plugin/this awesome plugin/\n";
+
+ Configuration conf = NutchConfiguration.create();
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter. NOTE(review): the value exceeds Integer.MAX_VALUE; confirm it is never parsed as an int.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // Check that the field holds the value written by the last replacement in the chain.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ }
+
+ /**
+ * Test a replacement pattern that uses the flags feature.
+ *
+ * The trailing 2 is the value of Pattern.CASE_INSENSITIVE. The pattern is written
+ * in upper case and is expected to match the lower-case text of the document.
+ */
+ @Test
+ public void testReplacementsWithFlags() {
+ String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+ String indexReplaceProperty = " metatag.description=/THIS PLUGIN/this awesome plugin/2";
+
+ Configuration conf = NutchConfiguration.create();
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter. NOTE(review): the value exceeds Integer.MAX_VALUE; confirm it is never parsed as an int.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // Check that the case-insensitive replacement produced the expected
+ // value.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ }
+
+ /**
+ * Test a replacement pattern that uses the target field feature (metatag.description:new).
+ * Check that the input field is not modified and that the target field is added.
+ */
+ @Test
+ public void testReplacementsDifferentTarget() {
+ String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+ String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+ String indexReplaceProperty = " metatag.description:new=/this plugin/this awesome plugin/";
+
+ Configuration conf = NutchConfiguration.create();
+ conf.set(
+ "plugin.includes",
+ "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+ conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+ conf.set("metatags.names", "author,description,keywords");
+ conf.set("index.parse.md",
+ "metatag.author,metatag.description,metatag.keywords");
+ // Not necessary but helpful when debugging the filter. NOTE(review): the value exceeds Integer.MAX_VALUE; confirm it is never parsed as an int.
+ conf.set("http.timeout", "99999999999");
+
+ // Run the document through the parser and index filters.
+ NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+ // Check that the input field has not been modified.
+ Assert.assertEquals(expectedDescription,
+ doc.getFieldValue("metatag.description"));
+ // Check that the target field has been created with the replaced value.
+ Assert.assertEquals(expectedTargetDescription,
+ doc.getFieldValue("new"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html b/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html
new file mode 100644
index 0000000..0b90fc2
--- /dev/null
+++ b/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html
@@ -0,0 +1,12 @@
+<html>
+ <head>
+ <title>Testing the power of the index-replace plugin</title>
+ <meta name="description" content="With this plugin, I control the description! Bwuhuhuhaha!">
+ <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!">
+ <meta name="author" content="Peter Ciuffetti">
+ </head>
+ <body>
+ <p>This html file is used to test the Nutch index-replace regexp replacer plugin.
+ A decidedly boring thing to do.</p>
+ </body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/build.xml b/nutch-plugins/index-static/build.xml
new file mode 100644
index 0000000..0ec5665
--- /dev/null
+++ b/nutch-plugins/index-static/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-static" default="jar-core">
+ <!-- No targets are declared here; the default jar-core target is expected to come from the imported shared plugin build file. -->
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/ivy.xml b/nutch-plugins/index-static/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/index-static/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+ <!-- NOTE(review): the include path below climbs three directories from nutch-plugins/index-static; confirm it still reaches ivy/ivy-configurations.xml in this layout. -->
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!-- No artifact name is given here; it is taken from the module name declared in the info element. -->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/plugin.xml b/nutch-plugins/index-static/plugin.xml
new file mode 100644
index 0000000..539e355
--- /dev/null
+++ b/nutch-plugins/index-static/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="index-static"
+ name="Index Static"
+ version="1.0.0"
+ provider-name="nutch.org">
+ <!-- Runtime library of this plugin; the * export presumably exposes all of its classes. -->
+ <runtime>
+ <library name="index-static.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <!-- Registers StaticFieldIndexer as an implementation of the org.apache.nutch.indexer.IndexingFilter extension point. -->
+ <extension id="org.apache.nutch.indexer.staticfield"
+ name="Nutch static field index"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="StaticField"
+ class="org.apache.nutch.indexer.staticfield.StaticFieldIndexer"/>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/pom.xml b/nutch-plugins/index-static/pom.xml
new file mode 100644
index 0000000..6eaf0ba
--- /dev/null
+++ b/nutch-plugins/index-static/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <!-- groupId and version are not declared in this POM, so Maven inherits them from the parent below. -->
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>index-static</artifactId>
+ <packaging>jar</packaging>
+
+ <name>index-static</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <!-- NOTE(review): no dependencies are declared here; confirm the parent POM supplies what this plugin compiles against. -->
+</project>