You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [14/25] - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/api/impl/db/ src/java/org/apache/nutch/api/model/response/ src/java/org/apache/nutch/api/resources/ s...

Modified: nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -38,101 +38,101 @@ import java.util.StringTokenizer;
 
 /** Adds basic searchable fields to a document. */
 public class CCIndexingFilter implements IndexingFilter {
-	public static final Logger LOG = LoggerFactory.getLogger(CCIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(CCIndexingFilter.class);
 
-	/** The name of the document field we use. */
-	public static String FIELD = "cc";
+  /** The name of the document field we use. */
+  public static String FIELD = "cc";
 
-	private Configuration conf;
+  private Configuration conf;
 
-	private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+  static {
+    FIELDS.add(WebPage.Field.BASE_URL);
+    FIELDS.add(WebPage.Field.METADATA);
+  }
+
+  /**
+   * Add the features represented by a license URL. Urls are of the form
+   * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
+   * license feature.
+   */
+  public void addUrlFeatures(NutchDocument doc, String urlString) {
+    try {
+      URL url = new URL(urlString);
+
+      // tokenize the path of the url, breaking at slashes and dashes
+      StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
+
+      if (names.hasMoreTokens())
+        names.nextToken(); // throw away "licenses"
+
+      // add a feature per component after "licenses"
+      while (names.hasMoreTokens()) {
+        String feature = names.nextToken();
+        addFeature(doc, feature);
+      }
+    } catch (MalformedURLException e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
+      }
+    }
+  }
+
+  private void addFeature(NutchDocument doc, String feature) {
+    doc.add(FIELD, feature);
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  @Override
+  public Collection<Field> getFields() {
+    return FIELDS;
+  }
+
+  @Override
+  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+      throws IndexingException {
+
+    ByteBuffer blicense = page.getMetadata().get(
+        new Utf8(CreativeCommons.LICENSE_URL));
+    if (blicense != null) {
+      String licenseUrl = Bytes.toString(blicense);
+      if (LOG.isInfoEnabled()) {
+        LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
+      }
+
+      // add the entire license as cc:license=xxx
+      addFeature(doc, "license=" + licenseUrl);
+
+      // index license attributes extracted from the license url
+      addUrlFeatures(doc, licenseUrl);
+    }
+
+    // index the license location as cc:meta=xxx
+    ByteBuffer blicenseloc = page.getMetadata().get(
+        new Utf8(CreativeCommons.LICENSE_LOCATION));
+    if (blicenseloc != null) {
+      String licenseLocation = Bytes.toString(blicenseloc);
+      addFeature(doc, "meta=" + licenseLocation);
+    }
+
+    // index the work type cc:type=xxx
+    ByteBuffer bworkType = page.getMetadata().get(
+        new Utf8(CreativeCommons.WORK_TYPE));
+    if (bworkType != null) {
+      String workType = Bytes.toString(bworkType);
+      addFeature(doc, workType);
+    }
 
-	static {
-		FIELDS.add(WebPage.Field.BASE_URL);
-		FIELDS.add(WebPage.Field.METADATA);
-	}
-
-	/**
-	 * Add the features represented by a license URL. Urls are of the form
-	 * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
-	 * license feature.
-	 */
-	public void addUrlFeatures(NutchDocument doc, String urlString) {
-		try {
-			URL url = new URL(urlString);
-
-			// tokenize the path of the url, breaking at slashes and dashes
-			StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
-
-			if (names.hasMoreTokens())
-				names.nextToken(); // throw away "licenses"
-
-			// add a feature per component after "licenses"
-			while (names.hasMoreTokens()) {
-				String feature = names.nextToken();
-				addFeature(doc, feature);
-			}
-		} catch (MalformedURLException e) {
-			if (LOG.isWarnEnabled()) {
-				LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
-			}
-		}
-	}
-
-	private void addFeature(NutchDocument doc, String feature) {
-		doc.add(FIELD, feature);
-	}
-
-	public void setConf(Configuration conf) {
-		this.conf = conf;
-	}
-
-	public Configuration getConf() {
-		return this.conf;
-	}
-
-	@Override
-	public Collection<Field> getFields() {
-		return FIELDS;
-	}
-
-	@Override
-	public NutchDocument filter(NutchDocument doc, String url, WebPage page)
-			throws IndexingException {
-
-		ByteBuffer blicense = page.getMetadata().get(new Utf8(
-				CreativeCommons.LICENSE_URL));
-		if (blicense != null) {
-			String licenseUrl = Bytes.toString(blicense);
-			if (LOG.isInfoEnabled()) {
-				LOG.info("CC: indexing " + licenseUrl + " for: "
-						+ url.toString());
-			}
-
-			// add the entire license as cc:license=xxx
-			addFeature(doc, "license=" + licenseUrl);
-
-			// index license attributes extracted of the license url
-			addUrlFeatures(doc, licenseUrl);
-		}
-
-		// index the license location as cc:meta=xxx
-		ByteBuffer blicenseloc = page.getMetadata().get(new Utf8(
-				CreativeCommons.LICENSE_LOCATION));
-		if (blicenseloc != null) {
-			String licenseLocation = Bytes.toString(blicenseloc);
-			addFeature(doc, "meta=" + licenseLocation);
-		}
-
-		// index the work type cc:type=xxx
-		ByteBuffer bworkType = page.getMetadata().get(new Utf8(
-				CreativeCommons.WORK_TYPE));
-		if (bworkType != null) {
-			String workType = Bytes.toString(bworkType);
-			addFeature(doc, workType);
-		}
-
-		return doc;
-	}
+    return doc;
+  }
 
 }

Modified: nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Fri Jan  9 06:34:33 2015
@@ -55,8 +55,8 @@ public class CCParseFilter implements Pa
     }
 
     /** Scan the document adding attributes to metadata. */
-    public static void walk(Node doc, URL base, WebPage page,
-        Configuration conf) throws ParseException {
+    public static void walk(Node doc, URL base, WebPage page, Configuration conf)
+        throws ParseException {
 
       // walk the DOM tree, scanning for license data
       Walker walker = new Walker(base);
@@ -67,36 +67,37 @@ public class CCParseFilter implements Pa
       String licenseLocation = null;
       if (walker.rdfLicense != null) { // 1st choice: subject in RDF
         licenseLocation = "rdf";
-	licenseUrl = walker.rdfLicense;
+        licenseUrl = walker.rdfLicense;
       } else if (walker.relLicense != null) { // 2nd: anchor w/
         // rel=license
         licenseLocation = "rel";
         licenseUrl = walker.relLicense.toString();
       } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC
         // license
-	licenseLocation = "a";
-	licenseUrl = walker.anchorLicense.toString();
+        licenseLocation = "a";
+        licenseUrl = walker.anchorLicense.toString();
       } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
-          throw new ParseException("No CC license.  Excluding.");
+        throw new ParseException("No CC license.  Excluding.");
       }
 
       // add license to metadata
       if (licenseUrl != null) {
         if (LOG.isDebugEnabled()) {
-	  LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation + " of " + base);
-	}
-	page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
-	ByteBuffer.wrap(licenseUrl.getBytes()));
-	page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
-	    ByteBuffer.wrap(licenseLocation.getBytes()));
+          LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation
+              + " of " + base);
+        }
+        page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
+            ByteBuffer.wrap(licenseUrl.getBytes()));
+        page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
+            ByteBuffer.wrap(licenseLocation.getBytes()));
       }
 
       if (walker.workType != null) {
         if (LOG.isDebugEnabled()) {
-	  LOG.debug("CC: found " + walker.workType + " in " + base);
-	}
-	page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
-	   ByteBuffer.wrap(walker.workType.getBytes()));
+          LOG.debug("CC: found " + walker.workType + " in " + base);
+        }
+        page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
+            ByteBuffer.wrap(walker.workType.getBytes()));
       }
 
     }
@@ -121,8 +122,8 @@ public class CCParseFilter implements Pa
     }
 
     /**
-     * Extract license url from element, if any. Thse are the href attribute
-     * of anchor elements with rel="license". These must also point to
+     * Extract license url from element, if any. These are the href attribute of
+     * anchor elements with rel="license". These must also point to
      * http://creativecommons.org/licenses/.
      */
     private void findLicenseUrl(Element element) {
@@ -137,27 +138,27 @@ public class CCParseFilter implements Pa
       try {
         URL url = new URL(base, href); // resolve the url
         // check that it's a CC license URL
-	if ("http".equalsIgnoreCase(url.getProtocol())
-	    && "creativecommons.org".equalsIgnoreCase(url.getHost())
-	    && url.getPath() != null && url.getPath().startsWith("/licenses/")
-	    && url.getPath().length() > "/licenses/".length()) {
-
-	  // check rel="license"
-	  String rel = element.getAttribute("rel");
-	  if (rel != null && "license".equals(rel)
-	      && this.relLicense == null) {
-	    this.relLicense = url; // found rel license
-	  } else if (this.anchorLicense == null) {
-	    this.anchorLicense = url; // found anchor license
-	  }
-	}
+        if ("http".equalsIgnoreCase(url.getProtocol())
+            && "creativecommons.org".equalsIgnoreCase(url.getHost())
+            && url.getPath() != null && url.getPath().startsWith("/licenses/")
+            && url.getPath().length() > "/licenses/".length()) {
+
+          // check rel="license"
+          String rel = element.getAttribute("rel");
+          if (rel != null && "license".equals(rel) && this.relLicense == null) {
+            this.relLicense = url; // found rel license
+          } else if (this.anchorLicense == null) {
+            this.anchorLicense = url; // found anchor license
+          }
+        }
       } catch (MalformedURLException e) { // ignore malformed urls
       }
     }
 
     /** Configure a namespace aware XML parser. */
-    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory.newInstance();
-      
+    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+        .newInstance();
+
     static {
       FACTORY.setNamespaceAware(true);
     }
@@ -177,129 +178,132 @@ public class CCParseFilter implements Pa
       if (rdfPosition < 0)
         return; // no RDF, abort
       int nsPosition = comment.indexOf(CC_NS);
-        if (nsPosition < 0)
-	  return; // no RDF, abort
-	// try to parse the XML
-	Document doc;
-	try {
-          DocumentBuilder parser = FACTORY.newDocumentBuilder();
-	  doc = parser.parse(new InputSource(new StringReader(comment)));
-	} catch (Exception e) {
-	  if (LOG.isWarnEnabled()) {
-	    LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
-	  }
-	  // e.printStackTrace();
-	  return;
-	}
-
-	// check that root is rdf:RDF
-	NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
-	if (roots.getLength() != 1) {
-	  if (LOG.isWarnEnabled()) {
-	    LOG.warn("CC: No RDF root in " + base);
-	  }
-	  return;
-	}
-	Element rdf = (Element) roots.item(0);
-
-	// get cc:License nodes inside rdf:RDF
-	NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
-	for (int i = 0; i < licenses.getLength(); i++) {
-          Element l = (Element) licenses.item(i);
-	  // license is rdf:about= attribute from cc:License
-	  this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
-
-          // walk predicates of cc:License
-	  NodeList predicates = l.getChildNodes();
-	  for (int j = 0; j < predicates.getLength(); j++) {
-	    Node predicateNode = predicates.item(j);
-	    if (!(predicateNode instanceof Element))
-	      continue;
-	      Element predicateElement = (Element) predicateNode;
-              // extract predicates of cc:xxx predicates
-	      if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
-	        continue;
-	      }
-	      String predicate = predicateElement.getLocalName();
-              // object is rdf:resource from cc:xxx predicates
-	      String object = predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue();
-              // add object and predicate to metadata
-	      // metadata.put(object, predicate);
-	      //if (LOG.isInfoEnabled()) {
-	      // LOG.info("CC: found: "+predicate+"="+object);
-	      // }
-	  }
-	}
-
-	// get cc:Work nodes from rdf:RDF
-	NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
-	for (int i = 0; i < works.getLength(); i++) {
-	  Element l = (Element) works.item(i);
-
-	  // get dc:type nodes from cc:Work
-	  NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
-	  for (int j = 0; j < types.getLength(); j++) {
-	    Element type = (Element) types.item(j);
-	    String workUri = type.getAttributeNodeNS(RDF_NS, "resource").getValue();
-	    this.workType = (String) WORK_TYPE_NAMES.get(workUri);
-	    break;
-	  }
-	}
-      }
-    }
-
-    private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
-      
-    static {
-      FIELDS.add(WebPage.Field.BASE_URL);
-      FIELDS.add(WebPage.Field.METADATA);
-    }
-
-    private static final HashMap<String,String> WORK_TYPE_NAMES = new HashMap<String,String>();
-        
-    static {
-      WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
-      WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
-      WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
-      WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
-      WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
-      WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
-      WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
-    }
-
-    private Configuration conf;
-
-    public void setConf(Configuration conf) {
-      this.conf = conf;
-    }
-
-    public Configuration getConf() {
-      return this.conf;
-    }
-
-    @Override
-    public Collection<Field> getFields() {
-      return FIELDS;
-    }
-
-    /**
-     * Adds metadata or otherwise modifies a parse of an HTML document, given
-     * the DOM tree of a page.
-     */
-    @Override
-    public Parse filter(String url, WebPage page, Parse parse,
-        HTMLMetaTags metaTags, DocumentFragment doc) {
-      // construct base url
-      URL base;
+      if (nsPosition < 0)
+        return; // no RDF, abort
+      // try to parse the XML
+      Document doc;
       try {
-        base = new URL(page.getBaseUrl().toString());
-	// extract license metadata
-	Walker.walk(doc, base, page, getConf());
+        DocumentBuilder parser = FACTORY.newDocumentBuilder();
+        doc = parser.parse(new InputSource(new StringReader(comment)));
       } catch (Exception e) {
-        LOG.error("Error parsing " + url, e);
-	return ParseStatusUtils.getEmptyParse(e, getConf());
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
+        }
+        // e.printStackTrace();
+        return;
+      }
+
+      // check that root is rdf:RDF
+      NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
+      if (roots.getLength() != 1) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("CC: No RDF root in " + base);
+        }
+        return;
       }
+      Element rdf = (Element) roots.item(0);
 
-      return parse;
+      // get cc:License nodes inside rdf:RDF
+      NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
+      for (int i = 0; i < licenses.getLength(); i++) {
+        Element l = (Element) licenses.item(i);
+        // license is rdf:about= attribute from cc:License
+        this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
+
+        // walk predicates of cc:License
+        NodeList predicates = l.getChildNodes();
+        for (int j = 0; j < predicates.getLength(); j++) {
+          Node predicateNode = predicates.item(j);
+          if (!(predicateNode instanceof Element))
+            continue;
+          Element predicateElement = (Element) predicateNode;
+          // extract predicates of cc:xxx predicates
+          if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
+            continue;
+          }
+          String predicate = predicateElement.getLocalName();
+          // object is rdf:resource from cc:xxx predicates
+          String object = predicateElement.getAttributeNodeNS(RDF_NS,
+              "resource").getValue();
+          // add object and predicate to metadata
+          // metadata.put(object, predicate);
+          // if (LOG.isInfoEnabled()) {
+          // LOG.info("CC: found: "+predicate+"="+object);
+          // }
+        }
+      }
+
+      // get cc:Work nodes from rdf:RDF
+      NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
+      for (int i = 0; i < works.getLength(); i++) {
+        Element l = (Element) works.item(i);
+
+        // get dc:type nodes from cc:Work
+        NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
+        for (int j = 0; j < types.getLength(); j++) {
+          Element type = (Element) types.item(j);
+          String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
+              .getValue();
+          this.workType = (String) WORK_TYPE_NAMES.get(workUri);
+          break;
+        }
+      }
+    }
+  }
+
+  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+  static {
+    FIELDS.add(WebPage.Field.BASE_URL);
+    FIELDS.add(WebPage.Field.METADATA);
+  }
+
+  private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();
+
+  static {
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+        "interactive");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
+  }
+
+  private Configuration conf;
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  @Override
+  public Collection<Field> getFields() {
+    return FIELDS;
+  }
+
+  /**
+   * Adds metadata or otherwise modifies a parse of an HTML document, given the
+   * DOM tree of a page.
+   */
+  @Override
+  public Parse filter(String url, WebPage page, Parse parse,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    // construct base url
+    URL base;
+    try {
+      base = new URL(page.getBaseUrl().toString());
+      // extract license metadata
+      Walker.walk(doc, base, page, getConf());
+    } catch (Exception e) {
+      LOG.error("Error parsing " + url, e);
+      return ParseStatusUtils.getEmptyParse(e, getConf());
     }
+
+    return parse;
+  }
 }

Modified: nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Fri Jan  9 06:34:33 2015
@@ -36,52 +36,50 @@ import static org.junit.Assert.assertEqu
 
 public class TestCCParseFilter {
 
-	private static final File testDir = new File(
-			System.getProperty("test.input"));
+  private static final File testDir = new File(System.getProperty("test.input"));
 
   @Test
-	public void testPages() throws Exception {
-		pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
-				"http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
-		// Tika returns <a> whereas parse-html returns <rel>
-		// check later
-		pageTest(new File(testDir, "rel.html"), "http://foo.com/",
-				"http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
-		// Tika returns <a> whereas parse-html returns <rdf>
-		// check later
-		pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
-				"http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
-	}
-
-	public void pageTest(File file, String url, String license,
-			String location, String type) throws Exception {
-
-		InputStream in = new FileInputStream(file);
-		ByteArrayOutputStream out = new ByteArrayOutputStream(
-				(int) file.length());
-		byte[] buffer = new byte[1024];
-		int i;
-		while ((i = in.read(buffer)) != -1) {
-			out.write(buffer, 0, i);
-		}
-		in.close();
-		byte[] bytes = out.toByteArray();
-		Configuration conf = NutchConfiguration.create();
-
-		WebPage page = WebPage.newBuilder().build();
-		page.setBaseUrl(new Utf8(url));
-		page.setContent(ByteBuffer.wrap(bytes));
-		MimeUtil mimeutil = new MimeUtil(conf);
-		String mtype = mimeutil.getMimeType(file);
-		page.setContentType(new Utf8(mtype));
-
-		new ParseUtil(conf).parse(url, page);
-
-		ByteBuffer bb = page.getMetadata().get(new Utf8("License-Url"));
-		assertEquals(license, Bytes.toString(bb));
-		bb = page.getMetadata().get(new Utf8("License-Location"));
-		assertEquals(location, Bytes.toString(bb));
-		bb = page.getMetadata().get(new Utf8("Work-Type"));
-        assertEquals(type, Bytes.toString(bb));
-	}
+  public void testPages() throws Exception {
+    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+    // Tika returns <a> whereas parse-html returns <rel>
+    // check later
+    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+    // Tika returns <a> whereas parse-html returns <rdf>
+    // check later
+    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+  }
+
+  public void pageTest(File file, String url, String license, String location,
+      String type) throws Exception {
+
+    InputStream in = new FileInputStream(file);
+    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
+    byte[] buffer = new byte[1024];
+    int i;
+    while ((i = in.read(buffer)) != -1) {
+      out.write(buffer, 0, i);
+    }
+    in.close();
+    byte[] bytes = out.toByteArray();
+    Configuration conf = NutchConfiguration.create();
+
+    WebPage page = WebPage.newBuilder().build();
+    page.setBaseUrl(new Utf8(url));
+    page.setContent(ByteBuffer.wrap(bytes));
+    MimeUtil mimeutil = new MimeUtil(conf);
+    String mtype = mimeutil.getMimeType(file);
+    page.setContentType(new Utf8(mtype));
+
+    new ParseUtil(conf).parse(url, page);
+
+    ByteBuffer bb = page.getMetadata().get(new Utf8("License-Url"));
+    assertEquals(license, Bytes.toString(bb));
+    bb = page.getMetadata().get(new Utf8("License-Location"));
+    assertEquals(location, Bytes.toString(bb));
+    bb = page.getMetadata().get(new Utf8("Work-Type"));
+    assertEquals(type, Bytes.toString(bb));
+  }
 }

Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -32,13 +32,15 @@ import java.util.HashSet;
 import java.util.Map.Entry;
 
 /**
- * Indexing filter that offers an option to either index all inbound anchor text for 
- * a document or deduplicate anchors. Deduplication does have it's con's, 
+ * Indexing filter that offers an option to either index all inbound anchor text
+ * for a document or deduplicate anchors. Deduplication does have its cons,
+ * 
  * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
  */
 public class AnchorIndexingFilter implements IndexingFilter {
 
-  public static final Logger LOG = LoggerFactory.getLogger(AnchorIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(AnchorIndexingFilter.class);
   private Configuration conf;
   private boolean deduplicate = false;
 
@@ -47,7 +49,7 @@ public class AnchorIndexingFilter implem
   static {
     FIELDS.add(WebPage.Field.INLINKS);
   }
-  
+
   /**
    * Set the {@link Configuration} object
    */
@@ -57,40 +59,44 @@ public class AnchorIndexingFilter implem
     deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
     LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
   }
-  
+
   /**
    * Get the {@link Configuration} object
    */
   public Configuration getConf() {
     return this.conf;
   }
-  
+
   public void addIndexBackendOptions(Configuration conf) {
   }
-  
+
   /**
-   * The {@link AnchorIndexingFilter} filter object which supports boolean 
-   * configuration settings for the deduplication of anchors. 
-   * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
-   *  
-   * @param doc The {@link NutchDocument} object
-   * @param url URL to be filtered for anchor text
-   * @param page {@link WebPage} object relative to the URL
+   * The {@link AnchorIndexingFilter} filter object which supports boolean
+   * configuration settings for the deduplication of anchors. See
+   * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param page
+   *          {@link WebPage} object relative to the URL
    * @return filtered NutchDocument
    */
   @Override
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {
     HashSet<String> set = null;
-    
+
     for (Entry<CharSequence, CharSequence> e : page.getInlinks().entrySet()) {
       String anchor = TableUtil.toString(e.getValue());
-      
-      if(anchor.equals(""))
+
+      if (anchor.equals(""))
         continue;
-      
+
       if (deduplicate) {
-        if (set == null) set = new HashSet<String>();
+        if (set == null)
+          set = new HashSet<String>();
         String lcAnchor = anchor.toLowerCase();
 
         // Check if already processed the current anchor
@@ -104,15 +110,14 @@ public class AnchorIndexingFilter implem
         doc.add("anchor", anchor);
       }
     }
-    
+
     return doc;
   }
-  
+
   /**
-   * Gets all the fields for a given {@link WebPage}
-   * Many datastores need to setup the mapreduce job by specifying the fields
-   * needed. All extensions that work on WebPage are able to specify what fields
-   * they need.
+   * Gets all the fields for a given {@link WebPage}. Many datastores need to
+   * set up the mapreduce job by specifying the fields needed. All extensions
+   * that work on WebPage are able to specify what fields they need.
    */
   @Override
   public Collection<WebPage.Field> getFields() {

Modified: nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -25,13 +25,12 @@ import org.junit.Test;
 import static org.junit.Assert.*;
 
 /**
- * JUnit test case which tests
- * 1. that anchor text is obtained
- * 2. that anchor deduplication functionality is working
- *
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
+ * 
  */
 public class TestAnchorIndexingFilter {
-  
+
   @Test
   public void testDeduplicateAnchor() throws Exception {
     Configuration conf = NutchConfiguration.create();
@@ -40,14 +39,19 @@ public class TestAnchorIndexingFilter {
     filter.setConf(conf);
     NutchDocument doc = new NutchDocument();
     WebPage page = WebPage.newBuilder().build();
-    page.getInlinks().put(new Utf8("http://example1.com/"), new Utf8("cool site"));
-    page.getInlinks().put(new Utf8("http://example2.com/"), new Utf8("cool site"));
-    page.getInlinks().put(new Utf8("http://example3.com/"), new Utf8("fun site"));
+    page.getInlinks().put(new Utf8("http://example1.com/"),
+        new Utf8("cool site"));
+    page.getInlinks().put(new Utf8("http://example2.com/"),
+        new Utf8("cool site"));
+    page.getInlinks().put(new Utf8("http://example3.com/"),
+        new Utf8("fun site"));
     filter.filter(doc, "http://myurldoesnotmatter.com/", page);
-    
-    assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
-    
-    assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor").size());
+
+    assertTrue("test if there is an anchor at all", doc.getFieldNames()
+        .contains("anchor"));
+
+    assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor")
+        .size());
   }
 
 }

Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -36,17 +36,17 @@ import java.util.Collection;
 import java.util.Date;
 import java.util.HashSet;
 
-/** Adds basic searchable fields to a document. The fields are:
- * host - add host as un-stored, indexed and tokenized
- * url - url is both stored and indexed, so it's both searchable and returned. 
- * This is also a required field.
- * content - content is indexed, so that it's searchable, but not stored in index
- * title - title is stored and indexed
- * cache - add cached content/summary display policy, if available
- * tstamp - add timestamp when fetched, for deduplication
+/**
+ * Adds basic searchable fields to a document. The fields are: host - add host
+ * as un-stored, indexed and tokenized; url - url is both stored and indexed, so
+ * it's both searchable and returned (this is also a required field); content -
+ * content is indexed, so that it's searchable, but not stored in index; title -
+ * title is stored and indexed; cache - add cached content/summary display
+ * policy, if available; tstamp - add timestamp when fetched, for deduplication.
  */
 public class BasicIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(BasicIndexingFilter.class);
 
   private int MAX_TITLE_LENGTH;
   private Configuration conf;
@@ -60,22 +60,25 @@ public class BasicIndexingFilter impleme
   }
 
   /**
-   * The {@link BasicIndexingFilter} filter object which supports boolean 
-   * configurable value for length of characters permitted within the 
-   * title @see {@code indexer.max.title.length} in nutch-default.xml
-   *  
-   * @param doc The {@link NutchDocument} object
-   * @param url URL to be filtered for anchor text
-   * @param page {@link WebPage} object relative to the URL
+   * The {@link BasicIndexingFilter} filter object, which supports a
+   * configurable maximum number of characters permitted within the title; see
+   * {@code indexer.max.title.length} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param page
+   *          {@link WebPage} object relative to the URL
    * @return filtered NutchDocument
    */
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {
 
     String reprUrl = null;
-//    if (page.isReadable(WebPage.Field.REPR_URL.getIndex())) {
-      reprUrl = TableUtil.toString(page.getReprUrl());
-//    }
+    // if (page.isReadable(WebPage.Field.REPR_URL.getIndex())) {
+    reprUrl = TableUtil.toString(page.getReprUrl());
+    // }
 
     String host = null;
     try {
@@ -103,7 +106,10 @@ public class BasicIndexingFilter impleme
 
     // title
     String title = TableUtil.toString(page.getTitle());
-    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
+    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
+                                                                      // title
+                                                                      // if
+                                                                      // needed
       title = title.substring(0, MAX_TITLE_LENGTH);
     }
     if (title.length() > 0) {
@@ -111,15 +117,16 @@ public class BasicIndexingFilter impleme
       doc.add("title", title);
     }
     // add cached content/summary display policy, if available
-    ByteBuffer cachingRaw = page
-        .getMetadata().get(Nutch.CACHING_FORBIDDEN_KEY_UTF8);
+    ByteBuffer cachingRaw = page.getMetadata().get(
+        Nutch.CACHING_FORBIDDEN_KEY_UTF8);
     String caching = Bytes.toString(cachingRaw);
     if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
       doc.add("cache", caching);
     }
 
     // add timestamp when fetched, for deduplication
-    String tstamp = DateUtil.getThreadLocalDateFormat().format(new Date(page.getFetchTime()));
+    String tstamp = DateUtil.getThreadLocalDateFormat().format(
+        new Date(page.getFetchTime()));
     doc.add("tstamp", tstamp);
 
     return doc;
@@ -134,7 +141,8 @@ public class BasicIndexingFilter impleme
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
-    LOG.info("Maximum title length for indexing set to: " + this.MAX_TITLE_LENGTH);
+    LOG.info("Maximum title length for indexing set to: "
+        + this.MAX_TITLE_LENGTH);
   }
 
   /**
@@ -145,10 +153,9 @@ public class BasicIndexingFilter impleme
   }
 
   /**
-   * Gets all the fields for a given {@link WebPage}
-   * Many datastores need to setup the mapreduce job by specifying the fields
-   * needed. All extensions that work on WebPage are able to specify what fields
-   * they need.
+   * Gets all the fields for a given {@link WebPage}. Many datastores need to
+   * set up the mapreduce job by specifying the fields needed. All extensions
+   * that work on WebPage are able to specify what fields they need.
    */
   @Override
   public Collection<WebPage.Field> getFields() {

Modified: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -29,66 +29,69 @@ import java.nio.ByteBuffer;
 import static org.junit.Assert.*;
 
 /**
- * JUnit test case which tests
- * 1. that the host, url, content, title, cache and tstamp fields 
- * are obtained by the filter.
- * 2. that configurable maximum length functionality for titles actually works. .
- * This property defaults at 100 characters @see {@code indexer.max.title.length} 
- * in nutch-default.xml but has been set to 10 for this test.
+ * JUnit test case which tests (1) that the host, url, content, title, cache and
+ * tstamp fields are obtained by the filter, and (2) that the configurable
+ * maximum title length functionality actually works. This property defaults to
+ * 100 characters (see {@code indexer.max.title.length} in nutch-default.xml) but
+ * has been set to 10 for this test.
  * 
  * @author lewismc
  */
 
 public class TestBasicIndexingFilter {
-  
+
   @Test
   public void testBasicFields() throws Exception {
-	Configuration conf = NutchConfiguration.create();
-	BasicIndexingFilter filter = new BasicIndexingFilter();
-	filter.setConf(conf);
-	assertNotNull(filter);
-	NutchDocument doc = new NutchDocument();
-	WebPage page = WebPage.newBuilder().build();
-	page.getInlinks().put(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch"));
-	page.setTitle(new Utf8("Welcome to Nutch"));
+    Configuration conf = NutchConfiguration.create();
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+    assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    WebPage page = WebPage.newBuilder().build();
+    page.getInlinks().put(new Utf8("http://nutch.apache.org/"),
+        new Utf8("Welcome to Nutch"));
+    page.setTitle(new Utf8("Welcome to Nutch"));
     page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org"));
     byte[] bytes = new byte[10];
     ByteBuffer bbuf = ByteBuffer.wrap(bytes);
     page.getMetadata().put(Nutch.CACHING_FORBIDDEN_KEY_UTF8, bbuf);
     page.setFetchTime(System.currentTimeMillis());
-	try {
-	  filter.filter(doc, "http://www.apache.org/", page);
-	} catch(Exception e) {
-	  e.printStackTrace();
-	  fail(e.getMessage());
-	}
-	assertNotNull(doc);
-	assertTrue("check for host field ", doc.getFieldNames().contains("host"));
-	assertTrue("check for url field", doc.getFieldNames().contains("url"));
-	assertTrue("check for content field", doc.getFieldNames().contains("content"));
-	assertTrue("check for title field", doc.getFieldNames().contains("title"));
-	assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
-	assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
+    try {
+      filter.filter(doc, "http://www.apache.org/", page);
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+    assertNotNull(doc);
+    assertTrue("check for host field ", doc.getFieldNames().contains("host"));
+    assertTrue("check for url field", doc.getFieldNames().contains("url"));
+    assertTrue("check for content field",
+        doc.getFieldNames().contains("content"));
+    assertTrue("check for title field", doc.getFieldNames().contains("title"));
+    assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
+    assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
   }
-  
+
   @Test
   public void testTitleFieldLength() throws Exception {
-	Configuration conf = NutchConfiguration.create();
-	conf.setInt("indexer.max.title.length", 10);
-	BasicIndexingFilter filter = new BasicIndexingFilter();
-	filter.setConf(conf);
-	assertNotNull(filter);
-	NutchDocument doc = new NutchDocument();
-	WebPage page = WebPage.newBuilder().build();
-	page.getInlinks().put(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site"));
-	page.setTitle(new Utf8("This title exceeds maximum characters"));
-	try {
-	  filter.filter(doc, "http://www.apache.org/", page);
-	} catch (Exception e) {
-	  e.printStackTrace();
-	  fail(e.getMessage());
-	}
-	assertNotNull(doc);
-	assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length());
+    Configuration conf = NutchConfiguration.create();
+    conf.setInt("indexer.max.title.length", 10);
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+    assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    WebPage page = WebPage.newBuilder().build();
+    page.getInlinks().put(new Utf8("http://exceedmaximumtitleurl.org/"),
+        new Utf8("exceeding title site"));
+    page.setTitle(new Utf8("This title exceeds maximum characters"));
+    try {
+      filter.filter(doc, "http://www.apache.org/", page);
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+    assertNotNull(doc);
+    assertEquals("assert title field only has 10 characters", 10, doc
+        .getFieldValue("title").length());
   }
 }

Modified: nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java (original)
+++ nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java Fri Jan  9 06:34:33 2015
@@ -42,7 +42,7 @@ import org.apache.nutch.util.Bytes;
 
 public class MetadataIndexer implements IndexingFilter {
   private Configuration conf;
-  private static Map<Utf8,String> parseFieldnames;
+  private static Map<Utf8, String> parseFieldnames;
   private static final String PARSE_CONF_PROPERTY = "index.metadata";
   private static final String INDEX_PREFIX = "meta_";
   private static final String PARSE_META_PREFIX = "meta_";
@@ -56,7 +56,7 @@ public class MetadataIndexer implements
 
     // add the fields from parsemd
     if (parseFieldnames != null) {
-      for (Entry<Utf8,String> metatag : parseFieldnames.entrySet()) {
+      for (Entry<Utf8, String> metatag : parseFieldnames.entrySet()) {
         ByteBuffer bvalues = page.getMetadata().get(metatag.getKey());
         if (bvalues != null) {
           String key = metatag.getValue();
@@ -75,7 +75,7 @@ public class MetadataIndexer implements
   public void setConf(Configuration conf) {
     this.conf = conf;
     String[] metatags = conf.getStrings(PARSE_CONF_PROPERTY);
-    parseFieldnames = new TreeMap<Utf8,String>();
+    parseFieldnames = new TreeMap<Utf8, String>();
     for (int i = 0; i < metatags.length; i++) {
       parseFieldnames.put(
           new Utf8(PARSE_META_PREFIX + metatags[i].toLowerCase(Locale.ROOT)),

Modified: nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java Fri Jan  9 06:34:33 2015
@@ -20,3 +20,4 @@
  * Metadata may come from CrawlDb, parse or content metadata.
  */
 package org.apache.nutch.indexer.metadata;
+

Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -30,10 +30,12 @@ import org.slf4j.LoggerFactory;
  * Add (or reset) a few metaData properties as respective fields (if they are
  * available), so that they can be accurately used within the search index.
  * 
- * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
- * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt 
- * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative 
- * that the content provider wants the filename therein to be used as the title.
+ * 'lastModified' is indexed to support query by date, 'contentLength' obtains
+ * content length from the HTTP header, 'type' field is indexed to support query
+ * by type and finally the 'title' field is an attempt to reset the title if a
+ * content-disposition hint exists. The logic is that such a presence is
+ * indicative that the content provider wants the filename therein to be used as
+ * the title.
  * 
  * Still need to make content-length searchable!
  * 
@@ -41,7 +43,8 @@ import org.slf4j.LoggerFactory;
  */
 
 public class MoreIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory.getLogger(MoreIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(MoreIndexingFilter.class);
 
   /** Get the MimeTypes resolver instance. */
   private MimeUtil MIME;
@@ -68,12 +71,13 @@ public class MoreIndexingFilter implemen
   // last-modified, or, if that's not present, use fetch time.
   private NutchDocument addTime(NutchDocument doc, WebPage page, String url) {
     long time = -1;
-    CharSequence lastModified = page
-        .getHeaders().get(new Utf8(HttpHeaders.LAST_MODIFIED));
+    CharSequence lastModified = page.getHeaders().get(
+        new Utf8(HttpHeaders.LAST_MODIFIED));
     // String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
     if (lastModified != null) { // try parse last-modified
       time = getTime(lastModified.toString(), url); // use as time
-      String formlastModified = DateUtil.getThreadLocalDateFormat().format(new Date(time));
+      String formlastModified = DateUtil.getThreadLocalDateFormat().format(
+          new Date(time));
       // store as string
       doc.add("lastModified", formlastModified);
     }
@@ -82,7 +86,8 @@ public class MoreIndexingFilter implemen
       time = page.getModifiedTime(); // use Modified time
     }
 
-    String dateString = DateUtil.getThreadLocalDateFormat().format(new Date(time));
+    String dateString = DateUtil.getThreadLocalDateFormat().format(
+        new Date(time));
 
     // un-stored, indexed and un-tokenized
     doc.add("date", dateString);
@@ -97,17 +102,19 @@ public class MoreIndexingFilter implemen
     } catch (ParseException e) {
       // try to parse it as date in alternative format
       try {
-        Date parsedDate = DateUtils.parseDate(date, new String[] {
-            "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
-            "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, dd MMM yyyy HH:mm:ss zzz",
-            "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz",
-            "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz",
-            "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS",
-            "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss",
-            "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz",
-            "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz",
-            "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss",
-            "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", "yyyy-MM-dd'T'HH:mm:ss'Z'" });
+        Date parsedDate = DateUtils.parseDate(date,
+            new String[] { "EEE MMM dd HH:mm:ss yyyy",
+                "EEE MMM dd HH:mm:ss yyyy zzz", "EEE MMM dd HH:mm:ss zzz yyyy",
+                "EEE, dd MMM yyyy HH:mm:ss zzz",
+                "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz",
+                "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz",
+                "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS",
+                "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss",
+                "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz",
+                "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz",
+                "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss",
+                "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
+                "yyyy-MM-dd'T'HH:mm:ss'Z'" });
         time = parsedDate.getTime();
         // if (LOG.isWarnEnabled()) {
         // LOG.warn(url + ": parsed date: " + date +" to:"+time);
@@ -123,8 +130,8 @@ public class MoreIndexingFilter implemen
 
   // Add Content-Length
   private NutchDocument addLength(NutchDocument doc, WebPage page, String url) {
-    CharSequence contentLength = page.getHeaders().get(new Utf8(
-            HttpHeaders.CONTENT_LENGTH));
+    CharSequence contentLength = page.getHeaders().get(
+        new Utf8(HttpHeaders.CONTENT_LENGTH));
     if (contentLength != null) {
       // NUTCH-1010 ContentLength not trimmed
       String trimmed = contentLength.toString().trim();
@@ -188,7 +195,7 @@ public class MoreIndexingFilter implemen
     if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
       String[] parts = getParts(mimeType);
 
-      for(String part: parts) {
+      for (String part : parts) {
         doc.add("type", part);
       }
     }
@@ -233,8 +240,8 @@ public class MoreIndexingFilter implemen
   }
 
   private NutchDocument resetTitle(NutchDocument doc, WebPage page, String url) {
-    CharSequence contentDisposition = page.getHeaders().get(new Utf8(
-        HttpHeaders.CONTENT_DISPOSITION));
+    CharSequence contentDisposition = page.getHeaders().get(
+        new Utf8(HttpHeaders.CONTENT_DISPOSITION));
     if (contentDisposition == null)
       return doc;
 

Modified: nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -37,7 +37,7 @@ public class TestMoreIndexingFilter {
     assertContentType(conf, "text/html", "text/html");
     assertContentType(conf, "text/html; charset=UTF-8", "text/html");
   }
-  
+
   public void testGetParts() {
     String[] parts = MoreIndexingFilter.getParts("text/html");
     assertParts(parts, 2, "text", "html");
@@ -48,34 +48,35 @@ public class TestMoreIndexingFilter {
    * @since NUTCH-901
    */
   @Test
-  public void testNoParts(){
-     Configuration conf = NutchConfiguration.create();
-     conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
-     MoreIndexingFilter filter = new MoreIndexingFilter();
-     filter.setConf(conf);
-     assertNotNull(filter);
-     NutchDocument doc = new NutchDocument();
-     try{
-       filter.filter(doc, "http://nutch.apache.org/index.html", WebPage.newBuilder().build());
-     }
-     catch(Exception e){
-       e.printStackTrace();
-       fail(e.getMessage());
-     }
-     assertNotNull(doc);
-     assertTrue(doc.getFieldNames().contains("type"));
-     assertEquals(1, doc.getFieldValues("type").size());
-     assertEquals("text/html", doc.getFieldValue("type"));     
+  public void testNoParts() {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    try {
+      filter.filter(doc, "http://nutch.apache.org/index.html", WebPage
+          .newBuilder().build());
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+    assertNotNull(doc);
+    assertTrue(doc.getFieldNames().contains("type"));
+    assertEquals(1, doc.getFieldValues("type").size());
+    assertEquals("text/html", doc.getFieldValue("type"));
   }
-  
+
   private void assertParts(String[] parts, int count, String... expected) {
     assertEquals(count, parts.length);
     for (int i = 0; i < expected.length; i++) {
       assertEquals(expected[i], parts[i]);
     }
   }
-  
-  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
+
+  private void assertContentType(Configuration conf, String source,
+      String expected) throws IndexingException {
     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
     WebPage page = WebPage.newBuilder().build();

Modified: nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java Fri Jan  9 06:34:33 2015
@@ -19,3 +19,4 @@
  * Index writer plugin for <a href="http://www.elasticsearch.org/">Elasticsearch</a>.
  */
 package org.apache.nutch.indexwriter.elastic;
+

Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Fri Jan  9 06:34:33 2015
@@ -22,7 +22,7 @@ public interface SolrConstants {
   public static final String SERVER_URL = SOLR_PREFIX + "server.url";
 
   public static final String COMMIT_SIZE = SOLR_PREFIX + "commit.size";
-  
+
   public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
 
   public static final String MAPPING_FILE = SOLR_PREFIX + "mapping.file";
@@ -32,15 +32,15 @@ public interface SolrConstants {
   public static final String USERNAME = SOLR_PREFIX + "auth.username";
 
   public static final String PASSWORD = SOLR_PREFIX + "auth.password";
-  
+
   public static final String ID_FIELD = "id";
-  
+
   public static final String URL_FIELD = "url";
-  
+
   public static final String BOOST_FIELD = "boost";
-  
+
   public static final String TIMESTAMP_FIELD = "tstamp";
-  
+
   public static final String DIGEST_FIELD = "digest";
 
 }

Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Fri Jan  9 06:34:33 2015
@@ -144,7 +144,9 @@ public class SolrIndexWriter implements
   public void commit() throws IOException {
     try {
       solr.commit();
-      LOG.info("Total " + documentCount + (documentCount > 1 ? " documents are " : " document is ") + "added.");
+      LOG.info("Total " + documentCount
+          + (documentCount > 1 ? " documents are " : " document is ")
+          + "added.");
     } catch (SolrServerException e) {
       throw makeIOException(e);
     }

Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java Fri Jan  9 06:34:33 2015
@@ -38,16 +38,17 @@ import org.xml.sax.SAXException;
 
 public class SolrMappingReader {
   public static Logger LOG = LoggerFactory.getLogger(SolrMappingReader.class);
-  
+
   private Configuration conf;
-  
+
   private Map<String, String> keyMap = new HashMap<String, String>();
   private Map<String, String> copyMap = new HashMap<String, String>();
   private String uniqueKey = "id";
-  
+
   public static synchronized SolrMappingReader getInstance(Configuration conf) {
     ObjectCache cache = ObjectCache.get(conf);
-    SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName());
+    SolrMappingReader instance = (SolrMappingReader) cache
+        .getObject(SolrMappingReader.class.getName());
     if (instance == null) {
       instance = new SolrMappingReader(conf);
       cache.setObject(SolrMappingReader.class.getName(), instance);
@@ -60,9 +61,10 @@ public class SolrMappingReader {
     parseMapping();
   }
 
-  private void parseMapping() {    
+  private void parseMapping() {
     InputStream ssInputStream = null;
-    ssInputStream = conf.getConfResourceAsInputStream(conf.get(SolrConstants.MAPPING_FILE, "solrindex-mapping.xml"));
+    ssInputStream = conf.getConfResourceAsInputStream(conf.get(
+        SolrConstants.MAPPING_FILE, "solrindex-mapping.xml"));
 
     InputSource inputSource = new InputSource(ssInputStream);
     try {
@@ -74,48 +76,50 @@ public class SolrMappingReader {
       if (fieldList.getLength() > 0) {
         for (int i = 0; i < fieldList.getLength(); i++) {
           Element element = (Element) fieldList.item(i);
-          LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
-          keyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+          LOG.info("source: " + element.getAttribute("source") + " dest: "
+              + element.getAttribute("dest"));
+          keyMap.put(element.getAttribute("source"),
+              element.getAttribute("dest"));
         }
       }
       NodeList copyFieldList = rootElement.getElementsByTagName("copyField");
       if (copyFieldList.getLength() > 0) {
         for (int i = 0; i < copyFieldList.getLength(); i++) {
           Element element = (Element) copyFieldList.item(i);
-          LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
-          copyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+          LOG.info("source: " + element.getAttribute("source") + " dest: "
+              + element.getAttribute("dest"));
+          copyMap.put(element.getAttribute("source"),
+              element.getAttribute("dest"));
         }
       }
       NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey");
       if (uniqueKeyItem.getLength() > 1) {
         LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'");
         uniqueKey = "id";
-      }
-      else if (uniqueKeyItem.getLength() == 0) {
+      } else if (uniqueKeyItem.getLength() == 0) {
         LOG.warn("No unique key definition found in solr index mapping using, default 'id'");
-      }
-      else{
-    	  uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
+      } else {
+        uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
       }
     } catch (MalformedURLException e) {
-        LOG.warn(e.toString());
+      LOG.warn(e.toString());
     } catch (SAXException e) {
-        LOG.warn(e.toString());
+      LOG.warn(e.toString());
     } catch (IOException e) {
-    	LOG.warn(e.toString());
+      LOG.warn(e.toString());
     } catch (ParserConfigurationException e) {
-    	LOG.warn(e.toString());
-    } 
+      LOG.warn(e.toString());
+    }
   }
-	  
+
   public Map<String, String> getKeyMap() {
     return keyMap;
   }
-	  
+
   public Map<String, String> getCopyMap() {
     return copyMap;
   }
-	  
+
   public String getUniqueKey() {
     return uniqueKey;
   }
@@ -128,14 +132,14 @@ public class SolrMappingReader {
   }
 
   public String mapKey(String key) throws IOException {
-    if(keyMap.containsKey(key)) {
+    if (keyMap.containsKey(key)) {
       key = keyMap.get(key);
     }
     return key;
   }
 
   public String mapCopyKey(String key) throws IOException {
-    if(copyMap.containsKey(key)) {
+    if (copyMap.containsKey(key)) {
       key = copyMap.get(key);
     }
     return key;

Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Fri Jan  9 06:34:33 2015
@@ -1,6 +1,5 @@
 package org.apache.nutch.indexwriter.solr;
 
-
 import org.apache.http.impl.client.DefaultHttpClient;
 import org.apache.http.auth.AuthScope;
 import org.apache.http.auth.UsernamePasswordCredentials;
@@ -17,7 +16,8 @@ public class SolrUtils {
 
   public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
 
-  public static HttpSolrServer getHttpSolrServer(Configuration job) throws MalformedURLException {
+  public static HttpSolrServer getHttpSolrServer(Configuration job)
+      throws MalformedURLException {
     DefaultHttpClient client = new DefaultHttpClient();
 
     // Check for username/password
@@ -26,10 +26,14 @@ public class SolrUtils {
 
       LOG.info("Authenticating as: " + username);
 
-      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT,
+          AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+
+      client.getCredentialsProvider().setCredentials(
+          scope,
+          new UsernamePasswordCredentials(username, job
+              .get(SolrConstants.PASSWORD)));
 
-      client.getCredentialsProvider().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
-      
       HttpParams params = client.getParams();
       HttpClientParams.setAuthenticating(params, true);
 
@@ -46,12 +50,14 @@ public class SolrUtils {
     for (int i = 0; i < input.length(); i++) {
       ch = input.charAt(i);
 
-      // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
-      // and non-printable control characters except tabulator, new line and carriage return
+      // Strip all non-characters
+      // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+      // and non-printable control characters except tabulator, new line and
+      // carriage return
       if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
-              ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
-              (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
-              (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
+          ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
+          (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
+          (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
 
         retval.append(ch);
       }

Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java Fri Jan  9 06:34:33 2015
@@ -19,3 +19,4 @@
  * Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>.
  */
 package org.apache.nutch.indexwriter.solr;
+

Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Jan  9 06:34:33 2015
@@ -47,7 +47,8 @@ import java.util.*;
  */
 public class HTMLLanguageParser implements ParseFilter {
 
-  public static final Logger LOG = LoggerFactory.getLogger(HTMLLanguageParser.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HTMLLanguageParser.class);
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -113,8 +114,8 @@ public class HTMLLanguageParser implemen
     }
 
     if (lang != null) {
-      page.getMetadata().put(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang
-              .getBytes()));
+      page.getMetadata().put(new Utf8(Metadata.LANGUAGE),
+          ByteBuffer.wrap(lang.getBytes()));
       return parse;
     }
 
@@ -135,7 +136,8 @@ public class HTMLLanguageParser implemen
       return lang;
     }
 
-    CharSequence ulang = page.getHeaders().get(new Utf8(Response.CONTENT_LANGUAGE));
+    CharSequence ulang = page.getHeaders().get(
+        new Utf8(Response.CONTENT_LANGUAGE));
     if (ulang != null) {
       lang = ulang.toString();
     }
@@ -154,7 +156,7 @@ public class HTMLLanguageParser implemen
 
       String content = parse.getText();
       if (content != null) {
-       text.append(" ").append(content.toString());
+        text.append(" ").append(content.toString());
       }
 
       LanguageIdentifier identifier = new LanguageIdentifier(text.toString());

Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Jan  9 06:34:33 2015
@@ -35,11 +35,10 @@ import java.util.HashSet;
 /**
  * An {@link org.apache.nutch.indexer.IndexingFilter} that adds a
  * <code>lang</code> (language) field to the document.
- *
- * It tries to find the language of the document by checking
- * if {@link HTMLLanguageParser} has added some language
- * information
- *
+ * 
+ * It tries to find the language of the document by checking if
+ * {@link HTMLLanguageParser} has added some language information
+ * 
  * @author Sami Siren
  * @author Jerome Charron
  */
@@ -56,7 +55,8 @@ public class LanguageIndexingFilter impl
   /**
    * Constructs a new Language Indexing Filter.
    */
-  public LanguageIndexingFilter() {}
+  public LanguageIndexingFilter() {
+  }
 
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {

Modified: nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Fri Jan  9 06:34:33 2015
@@ -96,8 +96,8 @@ public class TestHTMLLanguageParser {
         { "torp, stuga, uthyres, bed & breakfast", null } };
 
     for (int i = 0; i < 44; i++) {
-      assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser
-          .parseLanguage(tests[i][0]));
+      assertEquals(tests[i][1],
+          HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
     }
   }
 
@@ -151,8 +151,8 @@ public class TestHTMLLanguageParser {
     page.setBaseUrl(BASE);
     page.setContent(ByteBuffer.wrap(text.getBytes()));
     page.setContentType(new Utf8("text/html"));
-    page
-        .getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/html"));
+    page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8,
+        new Utf8("text/html"));
     return page;
   }
 }

Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java Fri Jan  9 06:34:33 2015
@@ -19,7 +19,7 @@ package org.apache.nutch.protocol.http.a
 
 @SuppressWarnings("serial")
 public class BlockedException extends HttpException {
-  
+
   public BlockedException(String msg) {
     super(msg);
   }