You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/08/23 00:23:27 UTC
svn commit: r1619942 - in /nutch: branches/2.x/
branches/2.x/src/java/org/apache/nutch/crawl/
branches/2.x/src/java/org/apache/nutch/parse/ trunk/
trunk/src/java/org/apache/nutch/crawl/
Author: snagel
Date: Fri Aug 22 22:23:27 2014
New Revision: 1619942
URL: http://svn.apache.org/r1619942
Log:
NUTCH-1693 TextMD5Signature computed on textual content
Added:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java (with props)
nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (with props)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/trunk/CHANGES.txt
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1619942&r1=1619941&r2=1619942&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Aug 22 22:23:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel)
+
* NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel)
* NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc)
Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java Fri Aug 22 22:23:27 2014
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.storage.WebPage;
+
+/**
+ * Page signature computed as an MD5 hash of the page's text, as returned by
+ * {@link WebPage#getText()}. If that text is null or empty, the calculation is
+ * delegated to an {@link MD5Signature} fallback instead.
+ */
+public class TextMD5Signature extends Signature {
+
+ private final static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static { // this class itself only registers the TEXT field
+ FIELDS.add(WebPage.Field.TEXT);
+ }
+
+ Signature fallback = new MD5Signature(); // used when the page has no text
+
+ @Override
+ public byte[] calculate(WebPage page) { // returns the signature bytes for page
+ CharSequence text = page.getText();
+
+ if (text == null || text.length() == 0) { // no text: defer to the fallback
+ return fallback.calculate(page);
+ }
+
+ return MD5Hash.digest(text.toString()).getDigest(); // MD5 digest of the text
+ }
+
+ @Override
+ public Collection<WebPage.Field> getFields() { // TEXT plus the fallback's fields
+ Collection<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS); // copy; FIELDS stays untouched
+ fields.addAll(fallback.getFields());
+ return fields;
+ }
+}
Propchange: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1619942&r1=1619941&r2=1619942&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Aug 22 22:23:27 2014
@@ -187,8 +187,6 @@ public class ParseUtil extends Configure
return;
}
- final byte[] signature = sig.calculate(page);
-
org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
page.setParseStatus(pstatus);
if (ParseStatusUtils.isSuccess(pstatus)) {
@@ -233,6 +231,7 @@ public class ParseUtil extends Configure
if (prevSig != null) {
page.setPrevSignature(prevSig);
}
+ final byte[] signature = sig.calculate(page);
page.setSignature(ByteBuffer.wrap(signature));
if (page.getOutlinks() != null) {
page.getOutlinks().clear();
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1619942&r1=1619941&r2=1619942&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Aug 22 22:23:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel)
+
* NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel)
Nutch 1.9 Release Change Log - 12/08/2014 (dd/mm/yyyy)
Added: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java Fri Aug 22 22:23:27 2014
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Page signature computed as an MD5 hash of the parsed text, as returned by
+ * {@link Parse#getText()}. If that text is null or empty, the calculation is
+ * delegated to an {@link MD5Signature} fallback instead.
+ */
+public class TextMD5Signature extends Signature {
+
+ Signature fallback = new MD5Signature(); // used when the parse has no text
+
+ public byte[] calculate(Content content, Parse parse) { // content is only passed on to the fallback
+ String text = parse.getText();
+
+ if (text == null || text.length() == 0) { // no text: defer to the fallback
+ return fallback.calculate(content, parse);
+ }
+
+ return MD5Hash.digest(text).getDigest(); // MD5 digest of the text
+ }
+}
Propchange: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
------------------------------------------------------------------------------
svn:eol-style = native