You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/01/08 22:06:14 UTC

svn commit: r1723794 - in /nutch/branches/2.x: ./ src/plugin/ src/plugin/index-html/src/java/org/apache/nutch/indexer/html/

Author: snagel
Date: Fri Jan  8 21:06:13 2016
New Revision: 1723794

URL: http://svn.apache.org/viewvc?rev=1723794&view=rev
Log:
NUTCH-2169 Integrate index-html into Nutch build

Added:
    nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java   (with props)
Removed:
    nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md
    nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/build.xml
    nutch/branches/2.x/default.properties
    nutch/branches/2.x/src/plugin/build.xml
    nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jan  8 21:06:13 2016
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1
 
+* NUTCH-2169 Integrate index-html into Nutch build (snagel)
+
 * NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel)
 
 * NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)

Modified: nutch/branches/2.x/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Fri Jan  8 21:06:13 2016
@@ -169,6 +169,7 @@
       <!--packageset dir="${plugins.dir}/feed/src/java"/-->
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
+      <packageset dir="${plugins.dir}/index-html/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -599,6 +600,7 @@
       <!--packageset dir="${plugins.dir}/feed/src/java"/-->
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
+      <packageset dir="${plugins.dir}/index-html/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -967,6 +969,7 @@
         <source path="${basedir}/src/plugin/index-anchor/src/test/" />
         <source path="${basedir}/src/plugin/index-basic/src/java/" />
         <source path="${basedir}/src/plugin/index-basic/src/test/" />
+        <source path="${basedir}/src/plugin/index-html/src/java/" />
         <source path="${basedir}/src/plugin/index-metadata/src/java/" />
         <source path="${basedir}/src/plugin/index-more/src/java/" />
         <source path="${basedir}/src/plugin/index-more/src/test/" />

Modified: nutch/branches/2.x/default.properties
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/default.properties?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/default.properties (original)
+++ nutch/branches/2.x/default.properties Fri Jan  8 21:06:13 2016
@@ -146,6 +146,7 @@ plugins.index=\
    org.apache.nutch.indexer.anchor*:\
    org.apache.nutch.indexer.basic*:\
    org.apache.nutch.indexer.feed*:\
+   org.apache.nutch.indexer.html*:\
    org.apache.nutch.indexer.metadata*:\
    org.apache.nutch.indexer.more*:\
    org.apache.nutch.indexer.subcollection*:\

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Fri Jan  8 21:06:13 2016
@@ -29,6 +29,7 @@
      <ant dir="creativecommons" target="deploy"/>
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
+     <ant dir="index-html" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
      <ant dir="index-metadata" target="deploy"/>
      <ant dir="indexer-solr" target="deploy"/>
@@ -116,6 +117,7 @@
     <ant dir="feed" target="clean"/>
     <ant dir="index-anchor" target="clean"/>
     <ant dir="index-basic" target="clean"/>
+    <ant dir="index-html" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="index-metadata" target="clean"/>
     <ant dir="indexer-solr" target="clean"/>

Modified: nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java?rev=1723794&r1=1723793&r2=1723794&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java Fri Jan  8 21:06:13 2016
@@ -16,45 +16,26 @@
  */
 package org.apache.nutch.indexer.html;
 
-import java.util.Scanner;
-import java.nio.ByteBuffer;
 import java.io.ByteArrayInputStream;
-
-import java.text.ParseException;
+import java.nio.ByteBuffer;
 import java.util.Collection;
-import java.util.Date;
 import java.util.HashSet;
+import java.util.Scanner;
 
-import org.apache.avro.util.Utf8;
-import org.apache.commons.lang.StringUtils;
-import org.apache.nutch.util.StringUtil;
-
-import org.apache.commons.lang.time.DateUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.HttpHeaders;
-import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.MimeUtil;
-import org.apache.nutch.util.TableUtil;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.solr.common.util.DateUtil;
+import org.apache.nutch.util.StringUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 
 /**
- * Add HTML of page the document element so it can be indexed in scheme.xml
- *
- * @author Mohamed Meabed <mo...@gmail.com>
+ * Add raw HTML content of a document to the index.
  */
 
 public class HtmlIndexingFilter implements IndexingFilter {
@@ -93,6 +74,7 @@ public class HtmlIndexingFilter implemen
                 data = scanner.next();
             }
             doc.add("rawcontent", StringUtil.cleanField(data));
+            scanner.close();
         }
         return doc;
     }

Added: nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java?rev=1723794&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java (added)
+++ nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java Fri Jan  8 21:06:13 2016
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index raw HTML content.
+ * 
+ * The plugin index-html adds the field &quot;rawcontent&quot; to the index.
+ * This field contains the raw (HTML) content of a document converted to a String.
+ */
+package org.apache.nutch.indexer.html;
+

Propchange: nutch/branches/2.x/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java
------------------------------------------------------------------------------
    svn:eol-style = native