You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:46 UTC
[02/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/build-ivy.xml b/src/plugin/protocol-selenium/build-ivy.xml
deleted file mode 100644
index 67d39cd..0000000
--- a/src/plugin/protocol-selenium/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
- <property name="ivy.install.version" value="2.1.0" />
- <condition property="ivy.home" value="${env.IVY_HOME}">
- <isset property="env.IVY_HOME" />
- </condition>
- <property name="ivy.home" value="${user.home}/.ant" />
- <property name="ivy.checksums" value="" />
- <property name="ivy.jar.dir" value="${ivy.home}/lib" />
- <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
- <target name="download-ivy" unless="offline">
-
- <mkdir dir="${ivy.jar.dir}"/>
- <!-- download Ivy from web site so that it can be used even without any special installation -->
- <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
- dest="${ivy.jar.file}" usetimestamp="true"/>
- </target>
-
- <target name="init-ivy" depends="download-ivy">
- <!-- try to load ivy here from ivy home, in case the user has not already dropped
- it into ant's lib dir (note that the latter copy will always take precedence).
- We will not fail as long as local lib dir exists (it may be empty) and
- ivy is in at least one of ant's lib dir or the local lib dir. -->
- <path id="ivy.lib.path">
- <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
- </path>
- <taskdef resource="org/apache/ivy/ant/antlib.xml"
- uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
- </target>
-
- <target name="deps-jar" depends="init-ivy">
- <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/build.xml b/src/plugin/protocol-selenium/build.xml
deleted file mode 100644
index 055018f..0000000
--- a/src/plugin/protocol-selenium/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-selenium" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Build compilation dependencies -->
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-http"/>
- <ant target="jar" inheritall="false" dir="../lib-selenium"/>
- </target>
-
- <!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-http/*.jar" />
- <include name="**/lib-selenium/*.jar" />
- </fileset>
- </path>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml
deleted file mode 100644
index ff07f8c..0000000
--- a/src/plugin/protocol-selenium/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../../ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="default"/>
- </publications>
-
- <dependencies>
- <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! -->
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/plugin.xml b/src/plugin/protocol-selenium/plugin.xml
deleted file mode 100644
index 1454c1b..0000000
--- a/src/plugin/protocol-selenium/plugin.xml
+++ /dev/null
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="protocol-selenium"
- name="Http Protocol Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="protocol-selenium.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- <import plugin="lib-http"/>
- <import plugin="lib-selenium"/>
- </requires>
-
- <extension id="org.apache.nutch.protocol.selenium"
- name="HttpProtocol"
- point="org.apache.nutch.protocol.Protocol">
-
- <implementation id="org.apache.nutch.protocol.selenium.Http"
- class="org.apache.nutch.protocol.selenium.Http">
- <parameter name="protocolName" value="http"/>
- </implementation>
-
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
deleted file mode 100644
index 7726bdf..0000000
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.selenium;
-
-// JDK imports
-import java.io.IOException;
-import java.net.URL;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.nutch.protocol.selenium.HttpResponse;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class Http extends HttpBase { // HttpBase subclass whose responses are built by this package's HttpResponse
-
- public static final Logger LOG = LoggerFactory.getLogger(Http.class); // public because HttpResponse logs through Http.LOG
-
- public Http() { // hands this class's logger to the HttpBase constructor
- super(LOG);
- }
-
- @Override
- public void setConf(Configuration conf) { // pure delegation; adds nothing to the inherited behaviour
- super.setConf(conf);
- }
-
- public static void main(String[] args) throws Exception { // command-line entry point: builds a configured instance and passes it on
- Http http = new Http();
- http.setConf(NutchConfiguration.create());
- main(http, args); // two-argument main is not declared in this class; presumably inherited from HttpBase — confirm
- }
-
- @Override
- protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) // the redirect flag is not forwarded to HttpResponse
- throws ProtocolException, IOException {
- return new HttpResponse(this, url, datum); // the HttpResponse constructor performs the fetch itself
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
deleted file mode 100644
index 681e838..0000000
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ /dev/null
@@ -1,360 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.selenium;
-
-// JDK imports
-import java.io.BufferedInputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.PushbackInputStream;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-
-/* Most of this code was borrowed from protocol-htmlunit, which in turn borrowed it from protocol-httpclient. */
-
-public class HttpResponse implements Response {
-
- private Http http;
- private URL url;
- private String orig;
- private String base;
- private byte[] content;
- private int code;
- private Metadata headers = new SpellCheckedMetadata();
-
- /** The nutch configuration */
- private Configuration conf = null;
-
- public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException { // performs the whole HTTP/1.0 exchange; fills code and headers, and content where a branch below stores it
-
- this.conf = http.getConf();
- this.http = http;
- this.url = url;
- this.orig = url.toString(); // orig and base are assigned here but never read anywhere in this class
- this.base = url.toString();
-
- if (!"http".equals(url.getProtocol())) // only the plain "http" scheme is accepted; anything else is rejected
- throw new HttpException("Not an HTTP url:" + url);
-
- if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("fetching " + url);
- }
-
- String path = "".equals(url.getFile()) ? "/" : url.getFile(); // an empty file part is requested as "/"
-
- // some servers will redirect a request with a host line like
- // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
- // don't want the :80...
-
- String host = url.getHost();
- int port;
- String portString;
- if (url.getPort() == -1) { // no explicit port in the URL: connect to 80 and keep ":port" out of the Host header
- port = 80;
- portString = "";
- } else {
- port = url.getPort();
- portString = ":" + port;
- }
- Socket socket = null;
-
- try {
- socket = new Socket(); // create the socket
- socket.setSoTimeout(http.getTimeout()); // same timeout value is used for reads and for connect below
-
- // connect
- String sockHost = http.useProxy(url) ? http.getProxyHost() : host; // talk to the proxy instead of the origin host when one applies
- int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
- InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
- socket.connect(sockAddr, http.getTimeout());
-
- // make request
- OutputStream req = socket.getOutputStream();
-
- StringBuffer reqStr = new StringBuffer("GET ");
- if (http.useProxy(url)) { // through a proxy the request line carries the absolute URL
- reqStr.append(url.getProtocol() + "://" + host + portString + path);
- } else {
- reqStr.append(path);
- }
-
- reqStr.append(" HTTP/1.0\r\n");
-
- reqStr.append("Host: ");
- reqStr.append(host);
- reqStr.append(portString);
- reqStr.append("\r\n");
-
- reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); // NOTE(review): nothing in this class decompresses the body read below — confirm a caller handles Content-Encoding
-
- String userAgent = http.getUserAgent();
- if ((userAgent == null) || (userAgent.length() == 0)) { // a missing user agent is logged and the header is simply omitted
- if (Http.LOG.isErrorEnabled()) {
- Http.LOG.error("User-agent is not set!");
- }
- } else {
- reqStr.append("User-Agent: ");
- reqStr.append(userAgent);
- reqStr.append("\r\n");
- }
-
- reqStr.append("Accept-Language: ");
- reqStr.append(this.http.getAcceptLanguage());
- reqStr.append("\r\n");
-
- reqStr.append("Accept: ");
- reqStr.append(this.http.getAccept());
- reqStr.append("\r\n");
-
- if (datum.getModifiedTime() > 0) { // conditional GET only when the datum carries a positive modified time
- reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
- reqStr.append("\r\n");
- }
- reqStr.append("\r\n"); // blank line terminates the request headers
-
- byte[] reqBytes = reqStr.toString().getBytes(); // no charset argument: encoded with the platform default charset
-
- req.write(reqBytes);
- req.flush();
-
- PushbackInputStream in = // process response
- new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
- Http.BUFFER_SIZE); // pushback capacity is needed by peek() and by parseHeaders(), which unread data
-
- StringBuffer line = new StringBuffer(); // scratch buffer reused for every status/header line
-
- boolean haveSeenNonContinueStatus = false;
- while (!haveSeenNonContinueStatus) { // repeat status + header parsing until a status other than 100 arrives
- // parse status code line
- this.code = parseStatusLine(in, line);
- // parse headers
- parseHeaders(in, line);
- haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
- }
-
- // Get Content type header
- String contentType = getHeader(Response.CONTENT_TYPE);
-
- // handle with Selenium only if content type in HTML or XHTML
- if (contentType != null) { // with no Content-Type header neither branch runs and content stays null
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { // case-sensitive substring match
- readPlainContent(url); // the body on this socket is not read; content comes from HttpWebClient instead
- } else {
- try {
- int contentLength = Integer.MAX_VALUE; // no Content-Length header: read until EOF or the max-content cap
- String contentLengthString = headers.get(Response.CONTENT_LENGTH);
- if (contentLengthString != null) {
- try {
- contentLength = Integer.parseInt(contentLengthString.trim());
- } catch (NumberFormatException ex) {
- throw new HttpException("bad content length: " + contentLengthString);
- }
- }
-
- if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { // a negative max content means no cap is applied
- contentLength = http.getMaxContent();
- }
-
- byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
- int bufferFilled = 0;
- int totalRead = 0;
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
- && totalRead + bufferFilled <= contentLength) { // a chunk that would cross the limit is dropped whole, which ends the loop
- totalRead += bufferFilled;
- out.write(buffer, 0, bufferFilled);
- }
-
- content = out.toByteArray();
-
- } catch (Exception e) { // also catches the HttpException thrown above for a bad Content-Length
- if (code == 200)
- throw new IOException(e.toString()); // only the text survives; the original exception is not chained as cause
- // for codes other than 200 OK, we are fine with empty content
- } finally {
- if (in != null) {
- in.close(); // the stream is closed only on this branch; the socket close below covers the others
- }
- }
- }
- }
-
- } finally {
- if (socket != null)
- socket.close(); // always release the connection, whether or not the exchange succeeded
- }
- }
-
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
-
- public URL getUrl() { // the URL object passed to the constructor
- return url;
- }
-
- public int getCode() { // status code of the last status line parsed by the constructor
- return code;
- }
-
- public String getHeader(String name) { // looks the name up in the response headers Metadata
- return headers.get(name);
- }
-
- public Metadata getHeaders() { // returns the internal Metadata instance itself, not a copy
- return headers;
- }
-
- public byte[] getContent() { // returns the internal array itself; null when the constructor stored no body
- return content;
- }
-
- /* -------------------------- *
- * </implementation:Response> *
- * -------------------------- */
-
- private void readPlainContent(URL url) throws IOException { // sets content to the page text HttpWebClient returns for this URL
- String page = HttpWebClient.getHtmlPage(url.toString(), conf); // HttpWebClient is defined outside this file; NOTE(review): a null return would cause a NullPointerException on the next statement — confirm it cannot happen
-
- content = page.getBytes("UTF-8"); // stored as UTF-8 bytes regardless of any charset named in the response headers
- }
-
- private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { // reads one line and returns the integer found between its first and second space
- readLine(in, line, false); // continuation lines are not allowed for the status line
-
- int codeStart = line.indexOf(" "); // -1 when the line has no space; the substring below then starts at index 0
- int codeEnd = line.indexOf(" ", codeStart + 1);
-
- // handle lines with no plaintext result code, ie:
- // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
- if (codeEnd == -1)
- codeEnd = line.length();
-
- int code; // local variable; shadows the code field, which the caller assigns from the return value
- try {
- code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
- } catch (NumberFormatException e) { // non-numeric status text is reported as a protocol error with the cause chained
- throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
- }
-
- return code;
- }
-
- private void processHeaderLine(StringBuffer line) throws IOException, HttpException { // splits one header line at its first colon and stores key/value in headers
-
- int colonIndex = line.indexOf(":"); // key is up to colon
- if (colonIndex == -1) { // no colon: whitespace-only lines are ignored, anything else is an error
- int i;
- for (i = 0; i < line.length(); i++) // find the first non-whitespace character, if any
- if (!Character.isWhitespace(line.charAt(i)))
- break;
- if (i == line.length())
- return;
- throw new HttpException("No colon in header:" + line);
- }
- String key = line.substring(0, colonIndex); // the key is taken verbatim, without trimming
-
- int valueStart = colonIndex + 1; // skip whitespace
- while (valueStart < line.length()) { // only leading spaces and tabs are skipped; trailing whitespace stays in the value
- int c = line.charAt(valueStart);
- if (c != ' ' && c != '\t')
- break;
- valueStart++;
- }
- String value = line.substring(valueStart);
- headers.set(key, value); // set(), so a repeated key presumably replaces the earlier value — confirm against Metadata
- }
-
- // Adds headers to our headers Metadata
- private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { // consumes header lines until an empty line, storing each via processHeaderLine
-
- while (readLine(in, line, true) != 0) { // readLine returns the line length; 0 marks the blank line that ends the headers
-
- // handle HTTP responses with missing blank line after headers
- int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
- || ((pos = line.indexOf("<html")) != -1)) { // only these three exact spellings are recognised as the start of a body
-
- in.unread(line.substring(pos).getBytes("UTF-8")); // push the markup back so it can be read as body; the line terminator already consumed is not restored
- line.setLength(pos); // keep only the text that preceded the markup as a candidate header
-
- try {
- //TODO: (CM) We don't know the header names here
- //since we're just handling them generically. It would
- //be nice to provide some sort of mapping function here
- //for the returned header names to the standard metadata
- //names in the ParseData class
- processHeaderLine(line);
- } catch (Exception e) { // best effort: a malformed remainder is logged and header parsing still ends normally
- // fixme:
- Http.LOG.warn("Error: ", e);
- }
- return;
- }
-
- processHeaderLine(line); // unlike the branch above, errors here propagate to the caller
- }
- }
-
- private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine) // fills line with the next line, terminator excluded, and returns its length
- throws IOException {
- line.setLength(0); // the caller's buffer is reused, so clear it first
- for (int c = in.read(); c != -1; c = in.read()) {
- switch (c) {
- case '\r':
- if (peek(in) == '\n') { // swallow the LF of a CRLF pair
- in.read();
- }
- case '\n': // there is no break after the '\r' case, so it falls through into this one
- if (line.length() > 0) {
- // at EOL -- check for continued line if the current
- // (possibly continued) line wasn't blank
- if (allowContinuedLine)
- switch (peek(in)) {
- case ' ':
- case '\t': // line is continued
- in.read(); // consume the single leading whitespace character of the continuation
- continue; // continues the enclosing for loop, so following bytes are appended to the same line
- }
- }
- return line.length(); // else complete
- default:
- line.append((char) c); // each byte becomes one char; no charset decoding is applied
- }
- }
- throw new EOFException(); // the stream ended before any line terminator was seen
- }
-
- private static int peek(PushbackInputStream in) throws IOException { // returns the next byte without consuming it, or -1 at end of stream
- int value = in.read();
- if (value != -1) { in.unread(value); } // never push back EOF: unread(-1) would store byte 0xFF, making the next read return 255 instead of -1
- return value;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
deleted file mode 100644
index 75cd5b5..0000000
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/build.xml b/src/plugin/scoring-depth/build.xml
deleted file mode 100644
index 6c041ed..0000000
--- a/src/plugin/scoring-depth/build.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<project name="scoring-depth" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/ivy.xml b/src/plugin/scoring-depth/ivy.xml
deleted file mode 100644
index 24d7606..0000000
--- a/src/plugin/scoring-depth/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../../ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/plugin.xml b/src/plugin/scoring-depth/plugin.xml
deleted file mode 100644
index ea57dc6..0000000
--- a/src/plugin/scoring-depth/plugin.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<plugin
- id="scoring-depth"
- name="Scoring plugin for depth-limited crawling."
- version="1.0.0"
- provider-name="ant.com">
-
- <runtime>
- <library name="scoring-depth.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.scoring.depth"
- name="Depth Scoring Filter"
- point="org.apache.nutch.scoring.ScoringFilter">
- <implementation id="DepthScoringFilter"
- class="org.apache.nutch.scoring.depth.DepthScoringFilter"/>
- </extension>
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
deleted file mode 100644
index 0a0dd27..0000000
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ /dev/null
@@ -1,207 +0,0 @@
-package org.apache.nutch.scoring.depth;
-
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-
-/**
- * This scoring filter limits the number of hops from the initial seed urls. If
- * the number of hops exceeds the depth (either the default value, or the one
- * set in the injector file) then all outlinks from that url are discarded,
- * effectively stopping further crawling along this path.
- */
-public class DepthScoringFilter extends Configured implements ScoringFilter {
- private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class);
-
- public static final String DEPTH_KEY = "_depth_";
- public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY);
- public static final String MAX_DEPTH_KEY = "_maxdepth_";
- public static final Text MAX_DEPTH_KEY_W = new Text(MAX_DEPTH_KEY);
-
- // maximum value that we are never likely to reach
- // because the depth of the Web graph is that high only
- // for spam cliques.
- public static final int DEFAULT_MAX_DEPTH = 1000;
-
- private int defaultMaxDepth;
-
- @Override
- public void setConf(Configuration conf) { // stores conf and caches the default maximum depth read from it
- super.setConf(conf);
- if (conf == null)
- return; // nothing to read; defaultMaxDepth keeps whatever value it had
- defaultMaxDepth = conf.getInt("scoring.depth.max", DEFAULT_MAX_DEPTH);
- if (defaultMaxDepth <= 0) { // zero or negative settings fall back to DEFAULT_MAX_DEPTH
- defaultMaxDepth = DEFAULT_MAX_DEPTH;
- }
- }
-
- @Override
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl, // tags every outlink with depth + 1, or clears all outlinks when depth is missing or the limit is reached
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount) throws ScoringFilterException { // adjust is returned untouched on every path; allCount is not used
- String depthString = parseData.getMeta(DEPTH_KEY);
- if (depthString == null) { // without a recorded depth the page's outlinks cannot be placed, so they are all dropped
- LOG.warn("Missing depth, removing all outlinks from url " + fromUrl);
- targets.clear();
- return adjust;
- }
- int curDepth = Integer.parseInt(depthString); // a non-numeric value would surface as an unchecked NumberFormatException
- int curMaxDepth = defaultMaxDepth;
- IntWritable customMaxDepth = null; // stays null unless the parse metadata carries its own max depth
- // allow overrides from injector
- String maxDepthString = parseData.getMeta(MAX_DEPTH_KEY);
- if (maxDepthString != null) {
- curMaxDepth = Integer.parseInt(maxDepthString);
- customMaxDepth = new IntWritable(curMaxDepth);
- }
- if (curDepth >= curMaxDepth) { // a page already at the limit contributes no outlinks
- // depth exceeded - throw away
- LOG.info("Depth limit (" + curMaxDepth
- + ") reached, ignoring outlinks for " + fromUrl);
- targets.clear();
- return adjust;
- }
- Iterator<Entry<Text, CrawlDatum>> it = targets.iterator();
- while (it.hasNext()) {
- Entry<Text, CrawlDatum> e = it.next();
- // record increased depth
- e.getValue().getMetaData()
- .put(DEPTH_KEY_W, new IntWritable(curDepth + 1)); // a fresh IntWritable per target
- // record maxDepth if any
- if (customMaxDepth != null) {
- e.getValue().getMetaData().put(MAX_DEPTH_KEY_W, customMaxDepth); // the same IntWritable instance is put into every target
- }
- }
- return adjust;
- }
-
- // prioritize by smaller values of depth
- @Override
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort) // returns initSort * (1 + maxDepth - depth), so smaller depths give larger multipliers
- throws ScoringFilterException {
- // boost up by current depth
- int curDepth, curMaxDepth;
- IntWritable maxDepth = (IntWritable) datum.getMetaData().get(
- MAX_DEPTH_KEY_W);
- if (maxDepth != null) {
- curMaxDepth = maxDepth.get();
- } else {
- curMaxDepth = defaultMaxDepth; // no per-datum limit recorded: use the configured default
- }
- IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
- if (depth == null) {
- // penalize
- curDepth = curMaxDepth; // makes mul 0 below, so the result is exactly initSort
- } else {
- curDepth = depth.get();
- }
- int mul = curMaxDepth - curDepth; // NOTE(review): negative if depth exceeds maxDepth, which would make the factor zero or negative — confirm that cannot occur
- return initSort * (1 + mul);
- }
-
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, // depth has no influence at indexing time
- CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
- throws ScoringFilterException {
- return initScore; // passed through unchanged; no other argument is read
- }
-
- @Override
- public void initialScore(Text url, CrawlDatum datum) // fills in max depth and depth on the datum only where they are absent
- throws ScoringFilterException {
- // the datum might already have some values set
- // e.g. obtained from redirection
- // in which case we don't want to override them
- if (datum.getMetaData().get(MAX_DEPTH_KEY_W) == null)
- datum.getMetaData()
- .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth)); // default limit comes from the value cached in setConf
- // initial depth is 1
- if (datum.getMetaData().get(DEPTH_KEY_W) == null)
- datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
- }
-
- @Override
- public void injectedScore(Text url, CrawlDatum datum) // normalises the max-depth entry to an IntWritable and sets depth to 1
- throws ScoringFilterException {
-
- // check for the presence of the depth limit key
- if (datum.getMetaData().get(MAX_DEPTH_KEY_W) != null) {
- // convert from Text to Int
- String depthString = datum.getMetaData().get(MAX_DEPTH_KEY_W).toString(); // goes through toString(), so the stored value's concrete type is not relied on
- datum.getMetaData().remove(MAX_DEPTH_KEY_W);
- int depth = Integer.parseInt(depthString); // a non-numeric value would surface as an unchecked NumberFormatException, after the entry was already removed
- datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(depth));
- } else { // put the default
- datum.getMetaData()
- .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
- }
- // initial depth is 1
- datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1)); // unconditional: any depth already on the datum is overwritten
- }
-
- @Override
- public void passScoreAfterParsing(Text url, Content content, Parse parse) // copies depth and max depth from the content metadata into the parse metadata
- throws ScoringFilterException {
- String depth = content.getMetadata().get(DEPTH_KEY);
- if (depth != null) { // absent values are simply not copied
- parse.getData().getParseMeta().set(DEPTH_KEY, depth);
- }
- String maxdepth = content.getMetadata().get(MAX_DEPTH_KEY);
- if (maxdepth != null) {
- parse.getData().getParseMeta().set(MAX_DEPTH_KEY, maxdepth);
- }
- }
-
- @Override
- public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) // copies depth and max depth from the datum metadata into the content metadata as strings
- throws ScoringFilterException {
- IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W); // the cast requires the stored value to be an IntWritable
- if (depth != null) {
- content.getMetadata().set(DEPTH_KEY, depth.toString());
- }
- IntWritable maxdepth = (IntWritable) datum.getMetaData().get(
- MAX_DEPTH_KEY_W);
- if (maxdepth != null) {
- content.getMetadata().set(MAX_DEPTH_KEY, maxdepth.toString());
- }
- }
-
- @Override
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, // stores on datum the smallest depth found on old and on the inlinked datums
- List<CrawlDatum> inlinked) throws ScoringFilterException {
- // find a minimum of all depths
- int newDepth = DEFAULT_MAX_DEPTH; // starting value is the constant, not the configured defaultMaxDepth field
- if (old != null) {
- IntWritable oldDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
- if (oldDepth != null) {
- newDepth = oldDepth.get();
- } else {
- // not set ?
- initialScore(url, old); // fills defaults on old only; newDepth itself is not changed by this call
- }
- }
- for (CrawlDatum lnk : inlinked) {
- IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W);
- if (depth != null && depth.get() < newDepth) { // inlinks without a depth entry are ignored
- newDepth = depth.get();
- }
- }
- datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(newDepth)); // written unconditionally, even when no depth was found and newDepth is still the starting value
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
deleted file mode 100644
index aa89797..0000000
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Scoring filter to stop crawling at a configurable depth
- * (number of "hops" from seed URLs).
- */
-package org.apache.nutch.scoring.depth;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/build.xml b/src/plugin/scoring-link/build.xml
deleted file mode 100644
index 123b1ea..0000000
--- a/src/plugin/scoring-link/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-link" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/ivy.xml b/src/plugin/scoring-link/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/scoring-link/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/plugin.xml b/src/plugin/scoring-link/plugin.xml
deleted file mode 100644
index 2b1c1e1..0000000
--- a/src/plugin/scoring-link/plugin.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="scoring-link"
- name="Link Analysis Scoring Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
-
- <runtime>
- <library name="scoring-link.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <extension id="org.apache.nutch.scoring.link"
- name="LinkAnalysisScoring"
- point="org.apache.nutch.scoring.ScoringFilter">
-
- <implementation id="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter"
- class="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" />
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
deleted file mode 100644
index a143f46..0000000
--- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.link;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-
-public class LinkAnalysisScoringFilter implements ScoringFilter {
-
- private Configuration conf;
- private float normalizedScore = 1.00f;
-
- public LinkAnalysisScoringFilter() {
-
- }
-
- public Configuration getConf() {
- return conf;
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
- }
-
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount) throws ScoringFilterException {
- return adjust;
- }
-
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
- throws ScoringFilterException {
- return datum.getScore() * initSort;
- }
-
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
- CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
- throws ScoringFilterException {
- return (normalizedScore * dbDatum.getScore());
- }
-
- public void initialScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- datum.setScore(0.0f);
- }
-
- public void injectedScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- }
-
- public void passScoreAfterParsing(Text url, Content content, Parse parse)
- throws ScoringFilterException {
- parse.getData().getContentMeta()
- .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
- }
-
- public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
- throws ScoringFilterException {
- content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
- }
-
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List<CrawlDatum> inlinked) throws ScoringFilterException {
- // nothing to do
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
deleted file mode 100644
index 9dc0c35..0000000
--- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Scoring filter used in conjunction with
- * {@link org.apache.nutch.scoring.webgraph.WebGraph}.
- */
-package org.apache.nutch.scoring.link;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/build.xml b/src/plugin/scoring-opic/build.xml
deleted file mode 100644
index 137dab4..0000000
--- a/src/plugin/scoring-opic/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-opic" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/ivy.xml b/src/plugin/scoring-opic/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/scoring-opic/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/plugin.xml b/src/plugin/scoring-opic/plugin.xml
deleted file mode 100644
index 3805a31..0000000
--- a/src/plugin/scoring-opic/plugin.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="scoring-opic"
- name="OPIC Scoring Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
-
- <runtime>
- <library name="scoring-opic.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <extension id="org.apache.nutch.scoring.opic"
- name="OPICScoring"
- point="org.apache.nutch.scoring.ScoringFilter">
-
- <implementation id="org.apache.nutch.scoring.opic.OPICScoringFilter"
- class="org.apache.nutch.scoring.opic.OPICScoringFilter" />
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
deleted file mode 100644
index e943d06..0000000
--- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.scoring.opic;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map.Entry;
-
-// Slf4j Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-
-/**
- * This plugin implements a variant of an Online Page Importance Computation
- * (OPIC) score, described in this paper: <a
- * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/>
- * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
- * On-Line Page Importance Computation </a>.
- *
- * @author Andrzej Bialecki
- */
-public class OPICScoringFilter implements ScoringFilter {
-
- private final static Logger LOG = LoggerFactory
- .getLogger(OPICScoringFilter.class);
-
- private Configuration conf;
- private float scoreInjected;
- private float scorePower;
- private float internalScoreFactor;
- private float externalScoreFactor;
- private boolean countFiltered;
-
- public Configuration getConf() {
- return conf;
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- scorePower = conf.getFloat("indexer.score.power", 0.5f);
- internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
- externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
- countFiltered = conf.getBoolean("db.score.count.filtered", false);
- }
-
- public void injectedScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- }
-
- /**
- * Set to 0.0f (unknown value) - inlink contributions will bring it to a
- * correct level. Newly discovered pages have at least one inlink.
- */
- public void initialScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- datum.setScore(0.0f);
- }
-
- /** Use {@link CrawlDatum#getScore()}. */
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
- throws ScoringFilterException {
- return datum.getScore() * initSort;
- }
-
- /** Increase the score by a sum of inlinked scores. */
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List<CrawlDatum> inlinked) throws ScoringFilterException {
- float adjust = 0.0f;
- for (int i = 0; i < inlinked.size(); i++) {
- CrawlDatum linked = inlinked.get(i);
- adjust += linked.getScore();
- }
- if (old == null)
- old = datum;
- datum.setScore(old.getScore() + adjust);
- }
-
- /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
- public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
- content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
- }
-
- /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
- public void passScoreAfterParsing(Text url, Content content, Parse parse) {
- parse.getData().getContentMeta()
- .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
- }
-
- /**
- * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
- * outlinks and apply.
- */
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount) throws ScoringFilterException {
- float score = scoreInjected;
- String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
- if (scoreString != null) {
- try {
- score = Float.parseFloat(scoreString);
- } catch (Exception e) {
- LOG.error("Error: ", e);
- }
- }
- int validCount = targets.size();
- if (countFiltered) {
- score /= allCount;
- } else {
- if (validCount == 0) {
- // no outlinks to distribute score, so just return adjust
- return adjust;
- }
- score /= validCount;
- }
- // internal and external score factor
- float internalScore = score * internalScoreFactor;
- float externalScore = score * externalScoreFactor;
- for (Entry<Text, CrawlDatum> target : targets) {
- try {
- String toHost = new URL(target.getKey().toString()).getHost();
- String fromHost = new URL(fromUrl.toString()).getHost();
- if (toHost.equalsIgnoreCase(fromHost)) {
- target.getValue().setScore(internalScore);
- } else {
- target.getValue().setScore(externalScore);
- }
- } catch (MalformedURLException e) {
- LOG.error("Error: ", e);
- target.getValue().setScore(externalScore);
- }
- }
- // XXX (ab) no adjustment? I think this is contrary to the algorithm descr.
- // XXX in the paper, where page "loses" its score if it's distributed to
- // XXX linked pages...
- return adjust;
- }
-
- /** Dampen the boost value by scorePower. */
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
- CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
- throws ScoringFilterException {
- return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
deleted file mode 100644
index 26f6cbe..0000000
--- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Scoring filter implementing a variant of the Online Page Importance Computation
- * (OPIC) algorithm.
- */
-package org.apache.nutch.scoring.opic;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/build-ivy.xml b/src/plugin/scoring-similarity/build-ivy.xml
deleted file mode 100644
index 50fbb96..0000000
--- a/src/plugin/scoring-similarity/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
- <property name="ivy.install.version" value="2.1.0" />
- <condition property="ivy.home" value="${env.IVY_HOME}">
- <isset property="env.IVY_HOME" />
- </condition>
- <property name="ivy.home" value="${user.home}/.ant" />
- <property name="ivy.checksums" value="" />
- <property name="ivy.jar.dir" value="${ivy.home}/lib" />
- <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
- <target name="download-ivy" unless="offline">
-
- <mkdir dir="${ivy.jar.dir}"/>
- <!-- download Ivy from web site so that it can be used even without any special installation -->
- <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
- dest="${ivy.jar.file}" usetimestamp="true"/>
- </target>
-
- <target name="init-ivy" depends="download-ivy">
- <!-- try to load ivy here from ivy home, in case the user has not already dropped
- it into ant's lib dir (note that the latter copy will always take precedence).
- We will not fail as long as local lib dir exists (it may be empty) and
- ivy is in at least one of ant's lib dir or the local lib dir. -->
- <path id="ivy.lib.path">
- <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
- </path>
- <taskdef resource="org/apache/ivy/ant/antlib.xml"
- uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
- </target>
-
- <target name="deps-jar" depends="init-ivy">
- <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/build.xml b/src/plugin/scoring-similarity/build.xml
deleted file mode 100644
index 66ac8f3..0000000
--- a/src/plugin/scoring-similarity/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-similarity" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml
deleted file mode 100644
index be0a1de..0000000
--- a/src/plugin/scoring-similarity/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml
deleted file mode 100644
index 9639c18..0000000
--- a/src/plugin/scoring-similarity/plugin.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="scoring-similarity"
- name="Similarity based Scoring Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
-
- <runtime>
- <library name="scoring-similarity.jar">
- <export name="*"/>
- </library>
- <library name="lucene-analyzers-common-5.5.0.jar"/>
- <library name="lucene-core-5.5.0.jar"/>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.scoring.similarity"
- name="SimilarityScoring"
- point="org.apache.nutch.scoring.ScoringFilter">
-
- <implementation id="scoring-similarity"
- class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
deleted file mode 100644
index f44fabd..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity;
-
-import java.util.Collection;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-
-public interface SimilarityModel {
-
- public void setConf(Configuration conf);
-
- public float setURLScoreAfterParsing(Text url, Content content, Parse parse);
-
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount);
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
deleted file mode 100644
index 0f905b8..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.AbstractScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity;
-
-public class SimilarityScoringFilter extends AbstractScoringFilter {
-
- private Configuration conf;
- private SimilarityModel similarityModel;
- @Override
- public Configuration getConf() {
- return conf;
- }
-
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- switch(conf.get("scoring.similarity.model","cosine")){
- case "cosine":
- similarityModel = (SimilarityModel) new CosineSimilarity();
- break;
- }
- similarityModel.setConf(conf);
- }
-
- @Override
- public void passScoreAfterParsing(Text url, Content content, Parse parse)
- throws ScoringFilterException {
-
- float score = similarityModel.setURLScoreAfterParsing(url, content, parse);
- parse.getData().getContentMeta()
- .set(Nutch.SCORE_KEY, score+"");
- }
-
- @Override
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount) throws ScoringFilterException {
- similarityModel.distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount);
- return adjust;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
deleted file mode 100644
index 9853b34..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity.cosine;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.similarity.SimilarityModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class CosineSimilarity implements SimilarityModel{
-
- private Configuration conf;
- private final static Logger LOG = LoggerFactory
- .getLogger(CosineSimilarity.class);
-
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- @Override
- public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
- float score = 1;
-
- try {
- if(!Model.isModelCreated){
- Model.createModel(conf);
- }
- String metatags = parse.getData().getParseMeta().get("metatag.keyword");
- String metaDescription = parse.getData().getParseMeta().get("metatag.description");
- int[] ngramArr = Model.retrieveNgrams(conf);
- int mingram = ngramArr[0];
- int maxgram = ngramArr[1];
- DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
- if(docVector!=null){
- score = Model.computeCosineSimilarity(docVector);
- LOG.info("Setting score of {} to {}",url, score);
- }
- else {
- throw new Exception("Could not create DocVector from parsed text");
- }
- } catch (Exception e) {
- LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e));
- }
- return score;
- }
-
- @Override
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
- Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
- int allCount) {
- float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
- for (Entry<Text, CrawlDatum> target : targets) {
- target.getValue().setScore(score);
- }
- return adjust;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
deleted file mode 100644
index 33c3a23..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity.cosine;
-
-import java.util.HashMap;
-import java.util.Map;
-
-public class DocVector {
-
- public HashMap<Integer, Long> termVector;
- public HashMap<String, Integer> termFreqVector;
-
- public DocVector() {
- termFreqVector = new HashMap<>();
- }
-
- public void setTermFreqVector(HashMap<String, Integer> termFreqVector) {
- this.termFreqVector = termFreqVector;
- }
-
- public void setVectorEntry(int pos, long freq) {
- termVector.put(pos, freq);
- }
-
- public float dotProduct(DocVector docVector) {
- float product = 0;
- for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
- if(docVector.termFreqVector.containsKey(entry.getKey())) {
- product += docVector.termFreqVector.get(entry.getKey())*entry.getValue();
- }
- }
- return product;
- }
-
- public float getL2Norm() {
- float sum = 0;
- for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
- sum += entry.getValue()*entry.getValue();
- }
- return (float) Math.sqrt(sum);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
deleted file mode 100644
index d8180f2..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ /dev/null
@@ -1,190 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity.cosine;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
-import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
-import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
-import org.apache.tika.Tika;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class creates a model used to store Document vector representation of the corpus.
- *
- */
-public class Model {
-
- //Currently only one file, but in future could accept a corpus hence an ArrayList
- public static ArrayList<DocVector> docVectors = new ArrayList<>();
- private static final Logger LOG = LoggerFactory.getLogger(Model.class);
- public static boolean isModelCreated = false;
- private static List<String> stopWords;
-
- public static synchronized void createModel(Configuration conf) throws IOException {
- if(isModelCreated) {
- LOG.info("Model exists, skipping model creation");
- return;
- }
- LOG.info("Creating Cosine model");
- try {
- //If user has specified a stopword file other than the template
- if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) {
- stopWords = new ArrayList<String>();
- String stopWord;
- BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file"))));
- while ((stopWord = br.readLine()) != null) {
- stopWords.add(stopWord);
- }
- LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
- }
-
- int[] ngramArr = retrieveNgrams(conf);
- int mingram = ngramArr[0];
- int maxgram = ngramArr[1];
- LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
-
- // TODO : Allow for corpus of documents to be provided as gold standard.
- String line;
- StringBuilder sb = new StringBuilder();
- BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file"))));
- while ((line = br.readLine()) != null) {
- sb.append(line);
- }
- DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
- if(goldStandard!=null)
- docVectors.add(goldStandard);
- else {
- throw new Exception("Could not create DocVector for goldstandard");
- }
- } catch (Exception e) {
- LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"),
- StringUtils.stringifyException(e));
- }
- if(docVectors.size()>0) {
- LOG.info("Cosine model creation complete");
- isModelCreated = true;
- }
- else
- LOG.info("Cosine model creation failed");
- }
-
- /**
- * Used to create a DocVector from given String text. Used during the parse stage of the crawl
- * cycle to create a DocVector of the currently parsed page from the parseText attribute value
- * @param content The text to tokenize
- * @param mingram Value of mingram for tokenizing
- * @param maxgram Value of maxgram for tokenizing
- */
- public static DocVector createDocVector(String content, int mingram, int maxgram) {
- LuceneTokenizer tokenizer;
-
- if(mingram > 1 && maxgram > 1){
- LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
- tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
- } else if (mingram > 1) {
- maxgram = mingram;
- LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
- tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
- }
- else if(stopWords!=null) {
- tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
- StemFilterType.PORTERSTEM_FILTER);
- }
- else {
- tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
- StemFilterType.PORTERSTEM_FILTER);
- }
- TokenStream tStream = tokenizer.getTokenStream();
- HashMap<String, Integer> termVector = new HashMap<>();
- try {
- CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
- tStream.reset();
- while(tStream.incrementToken()) {
- String term = charTermAttribute.toString();
- LOG.debug(term);
- if(termVector.containsKey(term)) {
- int count = termVector.get(term);
- count++;
- termVector.put(term, count);
- }
- else {
- termVector.put(term, 1);
- }
- }
- DocVector docVector = new DocVector();
- docVector.setTermFreqVector(termVector);
- return docVector;
- } catch (IOException e) {
- LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e));
- }
- return null;
- }
-
- public static float computeCosineSimilarity(DocVector docVector) {
- float scores[] = new float[docVectors.size()];
- int i=0;
- float maxScore = 0;
- for(DocVector corpusDoc : docVectors) {
- float numerator = docVector.dotProduct(corpusDoc);
- float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm();
- float currentScore = numerator/denominator;
- scores[i++] = currentScore;
- maxScore = (currentScore>maxScore)? currentScore : maxScore;
- }
- // Returning the max score amongst all documents in the corpus
- return maxScore;
- }
-
- /**
- * Retrieves mingram and maxgram from configuration
- * @param conf Configuration to retrieve mingram and maxgram
- * @return ngram array as mingram at first index and maxgram at second index
- */
- public static int[] retrieveNgrams(Configuration conf){
- int[] ngramArr = new int[2];
- //Check if user has specified mingram or ngram for ngram cosine model
- String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
- //mingram
- ngramArr[0] = Integer.parseInt(ngramStr[0]);
- int maxgram;
- if (ngramStr.length > 1) {
- //maxgram
- ngramArr[1] = Integer.parseInt(ngramStr[1]);
- } else {
- //maxgram
- ngramArr[1] = ngramArr[0];
- }
- return ngramArr;
- }
-}
\ No newline at end of file