You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/01/08 22:06:14 UTC
svn commit: r1723794 - in /nutch/branches/2.x: ./ src/plugin/
src/plugin/index-html/src/java/org/apache/nutch/indexer/html/
Author: snagel
Date: Fri Jan 8 21:06:13 2016
New Revision: 1723794
URL: http://svn.apache.org/viewvc?rev=1723794&view=rev
Log:
NUTCH-2169 Integrate index-html into Nutch build
Added:
nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java (with props)
Removed:
nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md
nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/build.xml
nutch/branches/2.x/default.properties
nutch/branches/2.x/src/plugin/build.xml
nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jan 8 21:06:13 2016
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 2.3.1 Release 22092015 (ddmmyyyy)
Release Report - http://s.apache.org/nutch_2.3.1
+* NUTCH-2169 Integrate index-html into Nutch build (snagel)
+
* NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel)
* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
Modified: nutch/branches/2.x/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Fri Jan 8 21:06:13 2016
@@ -169,6 +169,7 @@
<!--packageset dir="${plugins.dir}/feed/src/java"/-->
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
+ <packageset dir="${plugins.dir}/index-html/src/java"/>
<packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -599,6 +600,7 @@
<!--packageset dir="${plugins.dir}/feed/src/java"/-->
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
+ <packageset dir="${plugins.dir}/index-html/src/java"/>
<packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -967,6 +969,7 @@
<source path="${basedir}/src/plugin/index-anchor/src/test/" />
<source path="${basedir}/src/plugin/index-basic/src/java/" />
<source path="${basedir}/src/plugin/index-basic/src/test/" />
+ <source path="${basedir}/src/plugin/index-html/src/java/" />
<source path="${basedir}/src/plugin/index-metadata/src/java/" />
<source path="${basedir}/src/plugin/index-more/src/java/" />
<source path="${basedir}/src/plugin/index-more/src/test/" />
Modified: nutch/branches/2.x/default.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/default.properties?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/default.properties (original)
+++ nutch/branches/2.x/default.properties Fri Jan 8 21:06:13 2016
@@ -146,6 +146,7 @@ plugins.index=\
org.apache.nutch.indexer.anchor*:\
org.apache.nutch.indexer.basic*:\
org.apache.nutch.indexer.feed*:\
+ org.apache.nutch.indexer.html*:\
org.apache.nutch.indexer.metadata*:\
org.apache.nutch.indexer.more*:\
org.apache.nutch.indexer.subcollection*:\
Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Fri Jan 8 21:06:13 2016
@@ -29,6 +29,7 @@
<ant dir="creativecommons" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
+ <ant dir="index-html" target="deploy"/>
<ant dir="index-more" target="deploy"/>
<ant dir="index-metadata" target="deploy"/>
<ant dir="indexer-solr" target="deploy"/>
@@ -116,6 +117,7 @@
<ant dir="feed" target="clean"/>
<ant dir="index-anchor" target="clean"/>
<ant dir="index-basic" target="clean"/>
+ <ant dir="index-html" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="index-metadata" target="clean"/>
<ant dir="indexer-solr" target="clean"/>
Modified: nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java Fri Jan 8 21:06:13 2016
@@ -16,45 +16,26 @@
*/
package org.apache.nutch.indexer.html;
-import java.util.Scanner;
-import java.nio.ByteBuffer;
import java.io.ByteArrayInputStream;
-
-import java.text.ParseException;
+import java.nio.ByteBuffer;
import java.util.Collection;
-import java.util.Date;
import java.util.HashSet;
+import java.util.Scanner;
-import org.apache.avro.util.Utf8;
-import org.apache.commons.lang.StringUtils;
-import org.apache.nutch.util.StringUtil;
-
-import org.apache.commons.lang.time.DateUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.HttpHeaders;
-import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.MimeUtil;
-import org.apache.nutch.util.TableUtil;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.solr.common.util.DateUtil;
+import org.apache.nutch.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Add HTML of page the document element so it can be indexed in scheme.xml
- *
- * @author Mohamed Meabed <mo...@gmail.com>
+ * Add raw HTML content of a document to the index.
*/
public class HtmlIndexingFilter implements IndexingFilter {
@@ -93,6 +74,7 @@ public class HtmlIndexingFilter implemen
data = scanner.next();
}
doc.add("rawcontent", StringUtil.cleanField(data));
+ scanner.close();
}
return doc;
}
Added: nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java?rev=1723794&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java (added)
+++ nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java Fri Jan 8 21:06:13 2016
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index raw HTML content.
+ *
+ * The plugin index-html adds the field "rawcontent" to the index.
+ * This field contains the raw (HTML) content of a document converted to a String.
+ */
+package org.apache.nutch.indexer.html;
+
Propchange: nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java
------------------------------------------------------------------------------
svn:eol-style = native