You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [14/25] - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/
src/java/org/apache/nutch/api/impl/db/
src/java/org/apache/nutch/api/model/response/
src/java/org/apache/nutch/api/resources/ s...
Modified: nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -38,101 +38,101 @@ import java.util.StringTokenizer;
/** Adds basic searchable fields to a document. */
public class CCIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(CCIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(CCIndexingFilter.class);
- /** The name of the document field we use. */
- public static String FIELD = "cc";
+ /** The name of the document field we use. */
+ public static String FIELD = "cc";
- private Configuration conf;
+ private Configuration conf;
- private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.BASE_URL);
+ FIELDS.add(WebPage.Field.METADATA);
+ }
+
+ /**
+ * Add the features represented by a license URL. Urls are of the form
+ * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
+ * license feature.
+ */
+ public void addUrlFeatures(NutchDocument doc, String urlString) {
+ try {
+ URL url = new URL(urlString);
+
+ // tokenize the path of the url, breaking at slashes and dashes
+ StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
+
+ if (names.hasMoreTokens())
+ names.nextToken(); // throw away "licenses"
+
+ // add a feature per component after "licenses"
+ while (names.hasMoreTokens()) {
+ String feature = names.nextToken();
+ addFeature(doc, feature);
+ }
+ } catch (MalformedURLException e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
+ }
+ }
+ }
+
+ private void addFeature(NutchDocument doc, String feature) {
+ doc.add(FIELD, feature);
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ @Override
+ public Collection<Field> getFields() {
+ return FIELDS;
+ }
+
+ @Override
+ public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+ throws IndexingException {
+
+ ByteBuffer blicense = page.getMetadata().get(
+ new Utf8(CreativeCommons.LICENSE_URL));
+ if (blicense != null) {
+ String licenseUrl = Bytes.toString(blicense);
+ if (LOG.isInfoEnabled()) {
+ LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
+ }
+
+ // add the entire license as cc:license=xxx
+ addFeature(doc, "license=" + licenseUrl);
+
+ // index license attributes extracted of the license url
+ addUrlFeatures(doc, licenseUrl);
+ }
+
+ // index the license location as cc:meta=xxx
+ ByteBuffer blicenseloc = page.getMetadata().get(
+ new Utf8(CreativeCommons.LICENSE_LOCATION));
+ if (blicenseloc != null) {
+ String licenseLocation = Bytes.toString(blicenseloc);
+ addFeature(doc, "meta=" + licenseLocation);
+ }
+
+ // index the work type cc:type=xxx
+ ByteBuffer bworkType = page.getMetadata().get(
+ new Utf8(CreativeCommons.WORK_TYPE));
+ if (bworkType != null) {
+ String workType = Bytes.toString(bworkType);
+ addFeature(doc, workType);
+ }
- static {
- FIELDS.add(WebPage.Field.BASE_URL);
- FIELDS.add(WebPage.Field.METADATA);
- }
-
- /**
- * Add the features represented by a license URL. Urls are of the form
- * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
- * license feature.
- */
- public void addUrlFeatures(NutchDocument doc, String urlString) {
- try {
- URL url = new URL(urlString);
-
- // tokenize the path of the url, breaking at slashes and dashes
- StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
-
- if (names.hasMoreTokens())
- names.nextToken(); // throw away "licenses"
-
- // add a feature per component after "licenses"
- while (names.hasMoreTokens()) {
- String feature = names.nextToken();
- addFeature(doc, feature);
- }
- } catch (MalformedURLException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
- }
- }
- }
-
- private void addFeature(NutchDocument doc, String feature) {
- doc.add(FIELD, feature);
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
- @Override
- public Collection<Field> getFields() {
- return FIELDS;
- }
-
- @Override
- public NutchDocument filter(NutchDocument doc, String url, WebPage page)
- throws IndexingException {
-
- ByteBuffer blicense = page.getMetadata().get(new Utf8(
- CreativeCommons.LICENSE_URL));
- if (blicense != null) {
- String licenseUrl = Bytes.toString(blicense);
- if (LOG.isInfoEnabled()) {
- LOG.info("CC: indexing " + licenseUrl + " for: "
- + url.toString());
- }
-
- // add the entire license as cc:license=xxx
- addFeature(doc, "license=" + licenseUrl);
-
- // index license attributes extracted of the license url
- addUrlFeatures(doc, licenseUrl);
- }
-
- // index the license location as cc:meta=xxx
- ByteBuffer blicenseloc = page.getMetadata().get(new Utf8(
- CreativeCommons.LICENSE_LOCATION));
- if (blicenseloc != null) {
- String licenseLocation = Bytes.toString(blicenseloc);
- addFeature(doc, "meta=" + licenseLocation);
- }
-
- // index the work type cc:type=xxx
- ByteBuffer bworkType = page.getMetadata().get(new Utf8(
- CreativeCommons.WORK_TYPE));
- if (bworkType != null) {
- String workType = Bytes.toString(bworkType);
- addFeature(doc, workType);
- }
-
- return doc;
- }
+ return doc;
+ }
}
Modified: nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Fri Jan 9 06:34:33 2015
@@ -55,8 +55,8 @@ public class CCParseFilter implements Pa
}
/** Scan the document adding attributes to metadata. */
- public static void walk(Node doc, URL base, WebPage page,
- Configuration conf) throws ParseException {
+ public static void walk(Node doc, URL base, WebPage page, Configuration conf)
+ throws ParseException {
// walk the DOM tree, scanning for license data
Walker walker = new Walker(base);
@@ -67,36 +67,37 @@ public class CCParseFilter implements Pa
String licenseLocation = null;
if (walker.rdfLicense != null) { // 1st choice: subject in RDF
licenseLocation = "rdf";
- licenseUrl = walker.rdfLicense;
+ licenseUrl = walker.rdfLicense;
} else if (walker.relLicense != null) { // 2nd: anchor w/
// rel=license
licenseLocation = "rel";
licenseUrl = walker.relLicense.toString();
} else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC
// license
- licenseLocation = "a";
- licenseUrl = walker.anchorLicense.toString();
+ licenseLocation = "a";
+ licenseUrl = walker.anchorLicense.toString();
} else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
- throw new ParseException("No CC license. Excluding.");
+ throw new ParseException("No CC license. Excluding.");
}
// add license to metadata
if (licenseUrl != null) {
if (LOG.isDebugEnabled()) {
- LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation + " of " + base);
- }
- page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
- ByteBuffer.wrap(licenseUrl.getBytes()));
- page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
- ByteBuffer.wrap(licenseLocation.getBytes()));
+ LOG.debug("CC: found " + licenseUrl + " in " + licenseLocation
+ + " of " + base);
+ }
+ page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
+ ByteBuffer.wrap(licenseUrl.getBytes()));
+ page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
+ ByteBuffer.wrap(licenseLocation.getBytes()));
}
if (walker.workType != null) {
if (LOG.isDebugEnabled()) {
- LOG.debug("CC: found " + walker.workType + " in " + base);
- }
- page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
- ByteBuffer.wrap(walker.workType.getBytes()));
+ LOG.debug("CC: found " + walker.workType + " in " + base);
+ }
+ page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
+ ByteBuffer.wrap(walker.workType.getBytes()));
}
}
@@ -121,8 +122,8 @@ public class CCParseFilter implements Pa
}
/**
- * Extract license url from element, if any. Thse are the href attribute
- * of anchor elements with rel="license". These must also point to
+ * Extract license url from element, if any. These are the href attributes of
+ * anchor elements with rel="license". These must also point to
* http://creativecommons.org/licenses/.
*/
private void findLicenseUrl(Element element) {
@@ -137,27 +138,27 @@ public class CCParseFilter implements Pa
try {
URL url = new URL(base, href); // resolve the url
// check that it's a CC license URL
- if ("http".equalsIgnoreCase(url.getProtocol())
- && "creativecommons.org".equalsIgnoreCase(url.getHost())
- && url.getPath() != null && url.getPath().startsWith("/licenses/")
- && url.getPath().length() > "/licenses/".length()) {
-
- // check rel="license"
- String rel = element.getAttribute("rel");
- if (rel != null && "license".equals(rel)
- && this.relLicense == null) {
- this.relLicense = url; // found rel license
- } else if (this.anchorLicense == null) {
- this.anchorLicense = url; // found anchor license
- }
- }
+ if ("http".equalsIgnoreCase(url.getProtocol())
+ && "creativecommons.org".equalsIgnoreCase(url.getHost())
+ && url.getPath() != null && url.getPath().startsWith("/licenses/")
+ && url.getPath().length() > "/licenses/".length()) {
+
+ // check rel="license"
+ String rel = element.getAttribute("rel");
+ if (rel != null && "license".equals(rel) && this.relLicense == null) {
+ this.relLicense = url; // found rel license
+ } else if (this.anchorLicense == null) {
+ this.anchorLicense = url; // found anchor license
+ }
+ }
} catch (MalformedURLException e) { // ignore malformed urls
}
}
/** Configure a namespace aware XML parser. */
- private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory.newInstance();
-
+ private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+ .newInstance();
+
static {
FACTORY.setNamespaceAware(true);
}
@@ -177,129 +178,132 @@ public class CCParseFilter implements Pa
if (rdfPosition < 0)
return; // no RDF, abort
int nsPosition = comment.indexOf(CC_NS);
- if (nsPosition < 0)
- return; // no RDF, abort
- // try to parse the XML
- Document doc;
- try {
- DocumentBuilder parser = FACTORY.newDocumentBuilder();
- doc = parser.parse(new InputSource(new StringReader(comment)));
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
- }
- // e.printStackTrace();
- return;
- }
-
- // check that root is rdf:RDF
- NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
- if (roots.getLength() != 1) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("CC: No RDF root in " + base);
- }
- return;
- }
- Element rdf = (Element) roots.item(0);
-
- // get cc:License nodes inside rdf:RDF
- NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
- for (int i = 0; i < licenses.getLength(); i++) {
- Element l = (Element) licenses.item(i);
- // license is rdf:about= attribute from cc:License
- this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
-
- // walk predicates of cc:License
- NodeList predicates = l.getChildNodes();
- for (int j = 0; j < predicates.getLength(); j++) {
- Node predicateNode = predicates.item(j);
- if (!(predicateNode instanceof Element))
- continue;
- Element predicateElement = (Element) predicateNode;
- // extract predicates of cc:xxx predicates
- if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
- continue;
- }
- String predicate = predicateElement.getLocalName();
- // object is rdf:resource from cc:xxx predicates
- String object = predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue();
- // add object and predicate to metadata
- // metadata.put(object, predicate);
- //if (LOG.isInfoEnabled()) {
- // LOG.info("CC: found: "+predicate+"="+object);
- // }
- }
- }
-
- // get cc:Work nodes from rdf:RDF
- NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
- for (int i = 0; i < works.getLength(); i++) {
- Element l = (Element) works.item(i);
-
- // get dc:type nodes from cc:Work
- NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
- for (int j = 0; j < types.getLength(); j++) {
- Element type = (Element) types.item(j);
- String workUri = type.getAttributeNodeNS(RDF_NS, "resource").getValue();
- this.workType = (String) WORK_TYPE_NAMES.get(workUri);
- break;
- }
- }
- }
- }
-
- private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
-
- static {
- FIELDS.add(WebPage.Field.BASE_URL);
- FIELDS.add(WebPage.Field.METADATA);
- }
-
- private static final HashMap<String,String> WORK_TYPE_NAMES = new HashMap<String,String>();
-
- static {
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
- }
-
- private Configuration conf;
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
- @Override
- public Collection<Field> getFields() {
- return FIELDS;
- }
-
- /**
- * Adds metadata or otherwise modifies a parse of an HTML document, given
- * the DOM tree of a page.
- */
- @Override
- public Parse filter(String url, WebPage page, Parse parse,
- HTMLMetaTags metaTags, DocumentFragment doc) {
- // construct base url
- URL base;
+ if (nsPosition < 0)
+ return; // no RDF, abort
+ // try to parse the XML
+ Document doc;
try {
- base = new URL(page.getBaseUrl().toString());
- // extract license metadata
- Walker.walk(doc, base, page, getConf());
+ DocumentBuilder parser = FACTORY.newDocumentBuilder();
+ doc = parser.parse(new InputSource(new StringReader(comment)));
} catch (Exception e) {
- LOG.error("Error parsing " + url, e);
- return ParseStatusUtils.getEmptyParse(e, getConf());
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
+ }
+ // e.printStackTrace();
+ return;
+ }
+
+ // check that root is rdf:RDF
+ NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
+ if (roots.getLength() != 1) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("CC: No RDF root in " + base);
+ }
+ return;
}
+ Element rdf = (Element) roots.item(0);
- return parse;
+ // get cc:License nodes inside rdf:RDF
+ NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
+ for (int i = 0; i < licenses.getLength(); i++) {
+ Element l = (Element) licenses.item(i);
+ // license is rdf:about= attribute from cc:License
+ this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
+
+ // walk predicates of cc:License
+ NodeList predicates = l.getChildNodes();
+ for (int j = 0; j < predicates.getLength(); j++) {
+ Node predicateNode = predicates.item(j);
+ if (!(predicateNode instanceof Element))
+ continue;
+ Element predicateElement = (Element) predicateNode;
+ // extract predicates of cc:xxx predicates
+ if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
+ continue;
+ }
+ String predicate = predicateElement.getLocalName();
+ // object is rdf:resource from cc:xxx predicates
+ String object = predicateElement.getAttributeNodeNS(RDF_NS,
+ "resource").getValue();
+ // add object and predicate to metadata
+ // metadata.put(object, predicate);
+ // if (LOG.isInfoEnabled()) {
+ // LOG.info("CC: found: "+predicate+"="+object);
+ // }
+ }
+ }
+
+ // get cc:Work nodes from rdf:RDF
+ NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
+ for (int i = 0; i < works.getLength(); i++) {
+ Element l = (Element) works.item(i);
+
+ // get dc:type nodes from cc:Work
+ NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
+ for (int j = 0; j < types.getLength(); j++) {
+ Element type = (Element) types.item(j);
+ String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
+ .getValue();
+ this.workType = (String) WORK_TYPE_NAMES.get(workUri);
+ break;
+ }
+ }
+ }
+ }
+
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.BASE_URL);
+ FIELDS.add(WebPage.Field.METADATA);
+ }
+
+ private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();
+
+ static {
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+ "interactive");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
+ }
+
+ private Configuration conf;
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ @Override
+ public Collection<Field> getFields() {
+ return FIELDS;
+ }
+
+ /**
+ * Adds metadata or otherwise modifies a parse of an HTML document, given the
+ * DOM tree of a page.
+ */
+ @Override
+ public Parse filter(String url, WebPage page, Parse parse,
+ HTMLMetaTags metaTags, DocumentFragment doc) {
+ // construct base url
+ URL base;
+ try {
+ base = new URL(page.getBaseUrl().toString());
+ // extract license metadata
+ Walker.walk(doc, base, page, getConf());
+ } catch (Exception e) {
+ LOG.error("Error parsing " + url, e);
+ return ParseStatusUtils.getEmptyParse(e, getConf());
}
+
+ return parse;
+ }
}
Modified: nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Fri Jan 9 06:34:33 2015
@@ -36,52 +36,50 @@ import static org.junit.Assert.assertEqu
public class TestCCParseFilter {
- private static final File testDir = new File(
- System.getProperty("test.input"));
+ private static final File testDir = new File(System.getProperty("test.input"));
@Test
- public void testPages() throws Exception {
- pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
- // Tika returns <a> whereas parse-html returns <rel>
- // check later
- pageTest(new File(testDir, "rel.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
- // Tika returns <a> whereas parse-html returns <rdf>
- // check later
- pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
- }
-
- public void pageTest(File file, String url, String license,
- String location, String type) throws Exception {
-
- InputStream in = new FileInputStream(file);
- ByteArrayOutputStream out = new ByteArrayOutputStream(
- (int) file.length());
- byte[] buffer = new byte[1024];
- int i;
- while ((i = in.read(buffer)) != -1) {
- out.write(buffer, 0, i);
- }
- in.close();
- byte[] bytes = out.toByteArray();
- Configuration conf = NutchConfiguration.create();
-
- WebPage page = WebPage.newBuilder().build();
- page.setBaseUrl(new Utf8(url));
- page.setContent(ByteBuffer.wrap(bytes));
- MimeUtil mimeutil = new MimeUtil(conf);
- String mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype));
-
- new ParseUtil(conf).parse(url, page);
-
- ByteBuffer bb = page.getMetadata().get(new Utf8("License-Url"));
- assertEquals(license, Bytes.toString(bb));
- bb = page.getMetadata().get(new Utf8("License-Location"));
- assertEquals(location, Bytes.toString(bb));
- bb = page.getMetadata().get(new Utf8("Work-Type"));
- assertEquals(type, Bytes.toString(bb));
- }
+ public void testPages() throws Exception {
+ pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+ // Tika returns <a> whereas parse-html returns <rel>
+ // check later
+ pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+ // Tika returns <a> whereas parse-html returns <rdf>
+ // check later
+ pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+ }
+
+ public void pageTest(File file, String url, String license, String location,
+ String type) throws Exception {
+
+ InputStream in = new FileInputStream(file);
+ ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
+ byte[] buffer = new byte[1024];
+ int i;
+ while ((i = in.read(buffer)) != -1) {
+ out.write(buffer, 0, i);
+ }
+ in.close();
+ byte[] bytes = out.toByteArray();
+ Configuration conf = NutchConfiguration.create();
+
+ WebPage page = WebPage.newBuilder().build();
+ page.setBaseUrl(new Utf8(url));
+ page.setContent(ByteBuffer.wrap(bytes));
+ MimeUtil mimeutil = new MimeUtil(conf);
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
+
+ new ParseUtil(conf).parse(url, page);
+
+ ByteBuffer bb = page.getMetadata().get(new Utf8("License-Url"));
+ assertEquals(license, Bytes.toString(bb));
+ bb = page.getMetadata().get(new Utf8("License-Location"));
+ assertEquals(location, Bytes.toString(bb));
+ bb = page.getMetadata().get(new Utf8("Work-Type"));
+ assertEquals(type, Bytes.toString(bb));
+ }
}
Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -32,13 +32,15 @@ import java.util.HashSet;
import java.util.Map.Entry;
/**
- * Indexing filter that offers an option to either index all inbound anchor text for
- * a document or deduplicate anchors. Deduplication does have it's con's,
+ * Indexing filter that offers an option to either index all inbound anchor text
+ * for a document or deduplicate anchors. Deduplication does have its cons,
+ *
* @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
*/
public class AnchorIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(AnchorIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(AnchorIndexingFilter.class);
private Configuration conf;
private boolean deduplicate = false;
@@ -47,7 +49,7 @@ public class AnchorIndexingFilter implem
static {
FIELDS.add(WebPage.Field.INLINKS);
}
-
+
/**
* Set the {@link Configuration} object
*/
@@ -57,40 +59,44 @@ public class AnchorIndexingFilter implem
deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
}
-
+
/**
* Get the {@link Configuration} object
*/
public Configuration getConf() {
return this.conf;
}
-
+
public void addIndexBackendOptions(Configuration conf) {
}
-
+
/**
- * The {@link AnchorIndexingFilter} filter object which supports boolean
- * configuration settings for the deduplication of anchors.
- * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
- *
- * @param doc The {@link NutchDocument} object
- * @param url URL to be filtered for anchor text
- * @param page {@link WebPage} object relative to the URL
+ * The {@link AnchorIndexingFilter} filter object which supports boolean
+ * configuration settings for the deduplication of anchors. See
+ * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+ *
+ * @param doc
+ * The {@link NutchDocument} object
+ * @param url
+ * URL to be filtered for anchor text
+ * @param page
+ * {@link WebPage} object relative to the URL
* @return filtered NutchDocument
*/
@Override
public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
HashSet<String> set = null;
-
+
for (Entry<CharSequence, CharSequence> e : page.getInlinks().entrySet()) {
String anchor = TableUtil.toString(e.getValue());
-
- if(anchor.equals(""))
+
+ if (anchor.equals(""))
continue;
-
+
if (deduplicate) {
- if (set == null) set = new HashSet<String>();
+ if (set == null)
+ set = new HashSet<String>();
String lcAnchor = anchor.toLowerCase();
// Check if already processed the current anchor
@@ -104,15 +110,14 @@ public class AnchorIndexingFilter implem
doc.add("anchor", anchor);
}
}
-
+
return doc;
}
-
+
/**
- * Gets all the fields for a given {@link WebPage}
- * Many datastores need to setup the mapreduce job by specifying the fields
- * needed. All extensions that work on WebPage are able to specify what fields
- * they need.
+ * Gets all the fields for a given {@link WebPage} Many datastores need to
+ * setup the mapreduce job by specifying the fields needed. All extensions
+ * that work on WebPage are able to specify what fields they need.
*/
@Override
public Collection<WebPage.Field> getFields() {
Modified: nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -25,13 +25,12 @@ import org.junit.Test;
import static org.junit.Assert.*;
/**
- * JUnit test case which tests
- * 1. that anchor text is obtained
- * 2. that anchor deduplication functionality is working
- *
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
+ *
*/
public class TestAnchorIndexingFilter {
-
+
@Test
public void testDeduplicateAnchor() throws Exception {
Configuration conf = NutchConfiguration.create();
@@ -40,14 +39,19 @@ public class TestAnchorIndexingFilter {
filter.setConf(conf);
NutchDocument doc = new NutchDocument();
WebPage page = WebPage.newBuilder().build();
- page.getInlinks().put(new Utf8("http://example1.com/"), new Utf8("cool site"));
- page.getInlinks().put(new Utf8("http://example2.com/"), new Utf8("cool site"));
- page.getInlinks().put(new Utf8("http://example3.com/"), new Utf8("fun site"));
+ page.getInlinks().put(new Utf8("http://example1.com/"),
+ new Utf8("cool site"));
+ page.getInlinks().put(new Utf8("http://example2.com/"),
+ new Utf8("cool site"));
+ page.getInlinks().put(new Utf8("http://example3.com/"),
+ new Utf8("fun site"));
filter.filter(doc, "http://myurldoesnotmatter.com/", page);
-
- assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
-
- assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor").size());
+
+ assertTrue("test if there is an anchor at all", doc.getFieldNames()
+ .contains("anchor"));
+
+ assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor")
+ .size());
}
}
Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -36,17 +36,17 @@ import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
-/** Adds basic searchable fields to a document. The fields are:
- * host - add host as un-stored, indexed and tokenized
- * url - url is both stored and indexed, so it's both searchable and returned.
- * This is also a required field.
- * content - content is indexed, so that it's searchable, but not stored in index
- * title - title is stored and indexed
- * cache - add cached content/summary display policy, if available
- * tstamp - add timestamp when fetched, for deduplication
+/**
+ * Adds basic searchable fields to a document. The fields are: host - add host
+ * as un-stored, indexed and tokenized url - url is both stored and indexed, so
+ * it's both searchable and returned. This is also a required field. content -
+ * content is indexed, so that it's searchable, but not stored in index title -
+ * title is stored and indexed cache - add cached content/summary display
+ * policy, if available tstamp - add timestamp when fetched, for deduplication
*/
public class BasicIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(BasicIndexingFilter.class);
private int MAX_TITLE_LENGTH;
private Configuration conf;
@@ -60,22 +60,25 @@ public class BasicIndexingFilter impleme
}
/**
- * The {@link BasicIndexingFilter} filter object which supports boolean
- * configurable value for length of characters permitted within the
- * title @see {@code indexer.max.title.length} in nutch-default.xml
- *
- * @param doc The {@link NutchDocument} object
- * @param url URL to be filtered for anchor text
- * @param page {@link WebPage} object relative to the URL
+ * The {@link BasicIndexingFilter} filter object, which supports a
+ * configurable maximum number of characters permitted within the title; see
+ * {@code indexer.max.title.length} in nutch-default.xml.
+ *
+ * @param doc
+ * The {@link NutchDocument} object
+ * @param url
+ * URL to be filtered for anchor text
+ * @param page
+ * {@link WebPage} object relative to the URL
* @return filtered NutchDocument
*/
public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
String reprUrl = null;
-// if (page.isReadable(WebPage.Field.REPR_URL.getIndex())) {
- reprUrl = TableUtil.toString(page.getReprUrl());
-// }
+ // if (page.isReadable(WebPage.Field.REPR_URL.getIndex())) {
+ reprUrl = TableUtil.toString(page.getReprUrl());
+ // }
String host = null;
try {
@@ -103,7 +106,10 @@ public class BasicIndexingFilter impleme
// title
String title = TableUtil.toString(page.getTitle());
- if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
+ if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
+ // title
+ // if
+ // needed
title = title.substring(0, MAX_TITLE_LENGTH);
}
if (title.length() > 0) {
@@ -111,15 +117,16 @@ public class BasicIndexingFilter impleme
doc.add("title", title);
}
// add cached content/summary display policy, if available
- ByteBuffer cachingRaw = page
- .getMetadata().get(Nutch.CACHING_FORBIDDEN_KEY_UTF8);
+ ByteBuffer cachingRaw = page.getMetadata().get(
+ Nutch.CACHING_FORBIDDEN_KEY_UTF8);
String caching = Bytes.toString(cachingRaw);
if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
doc.add("cache", caching);
}
// add timestamp when fetched, for deduplication
- String tstamp = DateUtil.getThreadLocalDateFormat().format(new Date(page.getFetchTime()));
+ String tstamp = DateUtil.getThreadLocalDateFormat().format(
+ new Date(page.getFetchTime()));
doc.add("tstamp", tstamp);
return doc;
@@ -134,7 +141,8 @@ public class BasicIndexingFilter impleme
public void setConf(Configuration conf) {
this.conf = conf;
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
- LOG.info("Maximum title length for indexing set to: " + this.MAX_TITLE_LENGTH);
+ LOG.info("Maximum title length for indexing set to: "
+ + this.MAX_TITLE_LENGTH);
}
/**
@@ -145,10 +153,9 @@ public class BasicIndexingFilter impleme
}
/**
- * Gets all the fields for a given {@link WebPage}
- * Many datastores need to setup the mapreduce job by specifying the fields
- * needed. All extensions that work on WebPage are able to specify what fields
- * they need.
+ * Gets all the fields for a given {@link WebPage}. Many datastores need to
+ * setup the mapreduce job by specifying the fields needed. All extensions
+ * that work on WebPage are able to specify what fields they need.
*/
@Override
public Collection<WebPage.Field> getFields() {
Modified: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -29,66 +29,69 @@ import java.nio.ByteBuffer;
import static org.junit.Assert.*;
/**
- * JUnit test case which tests
- * 1. that the host, url, content, title, cache and tstamp fields
- * are obtained by the filter.
- * 2. that configurable maximum length functionality for titles actually works. .
- * This property defaults at 100 characters @see {@code indexer.max.title.length}
- * in nutch-default.xml but has been set to 10 for this test.
+ * JUnit test case which tests (1) that the host, url, content, title, cache
+ * and tstamp fields are obtained by the filter, and (2) that the configurable
+ * maximum length functionality for titles actually works. This property
+ * defaults to 100 characters (see {@code indexer.max.title.length} in
+ * nutch-default.xml) but has been set to 10 for this test.
*
* @author lewismc
*/
public class TestBasicIndexingFilter {
-
+
@Test
public void testBasicFields() throws Exception {
- Configuration conf = NutchConfiguration.create();
- BasicIndexingFilter filter = new BasicIndexingFilter();
- filter.setConf(conf);
- assertNotNull(filter);
- NutchDocument doc = new NutchDocument();
- WebPage page = WebPage.newBuilder().build();
- page.getInlinks().put(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch"));
- page.setTitle(new Utf8("Welcome to Nutch"));
+ Configuration conf = NutchConfiguration.create();
+ BasicIndexingFilter filter = new BasicIndexingFilter();
+ filter.setConf(conf);
+ assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ WebPage page = WebPage.newBuilder().build();
+ page.getInlinks().put(new Utf8("http://nutch.apache.org/"),
+ new Utf8("Welcome to Nutch"));
+ page.setTitle(new Utf8("Welcome to Nutch"));
page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org"));
byte[] bytes = new byte[10];
ByteBuffer bbuf = ByteBuffer.wrap(bytes);
page.getMetadata().put(Nutch.CACHING_FORBIDDEN_KEY_UTF8, bbuf);
page.setFetchTime(System.currentTimeMillis());
- try {
- filter.filter(doc, "http://www.apache.org/", page);
- } catch(Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- assertNotNull(doc);
- assertTrue("check for host field ", doc.getFieldNames().contains("host"));
- assertTrue("check for url field", doc.getFieldNames().contains("url"));
- assertTrue("check for content field", doc.getFieldNames().contains("content"));
- assertTrue("check for title field", doc.getFieldNames().contains("title"));
- assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
- assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
+ try {
+ filter.filter(doc, "http://www.apache.org/", page);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ assertNotNull(doc);
+ assertTrue("check for host field ", doc.getFieldNames().contains("host"));
+ assertTrue("check for url field", doc.getFieldNames().contains("url"));
+ assertTrue("check for content field",
+ doc.getFieldNames().contains("content"));
+ assertTrue("check for title field", doc.getFieldNames().contains("title"));
+ assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
+ assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
}
-
+
@Test
public void testTitleFieldLength() throws Exception {
- Configuration conf = NutchConfiguration.create();
- conf.setInt("indexer.max.title.length", 10);
- BasicIndexingFilter filter = new BasicIndexingFilter();
- filter.setConf(conf);
- assertNotNull(filter);
- NutchDocument doc = new NutchDocument();
- WebPage page = WebPage.newBuilder().build();
- page.getInlinks().put(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site"));
- page.setTitle(new Utf8("This title exceeds maximum characters"));
- try {
- filter.filter(doc, "http://www.apache.org/", page);
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
- assertNotNull(doc);
- assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length());
+ Configuration conf = NutchConfiguration.create();
+ conf.setInt("indexer.max.title.length", 10);
+ BasicIndexingFilter filter = new BasicIndexingFilter();
+ filter.setConf(conf);
+ assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ WebPage page = WebPage.newBuilder().build();
+ page.getInlinks().put(new Utf8("http://exceedmaximumtitleurl.org/"),
+ new Utf8("exceeding title site"));
+ page.setTitle(new Utf8("This title exceeds maximum characters"));
+ try {
+ filter.filter(doc, "http://www.apache.org/", page);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ assertNotNull(doc);
+ assertEquals("assert title field only has 10 characters", 10, doc
+ .getFieldValue("title").length());
}
}
Modified: nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java (original)
+++ nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java Fri Jan 9 06:34:33 2015
@@ -42,7 +42,7 @@ import org.apache.nutch.util.Bytes;
public class MetadataIndexer implements IndexingFilter {
private Configuration conf;
- private static Map<Utf8,String> parseFieldnames;
+ private static Map<Utf8, String> parseFieldnames;
private static final String PARSE_CONF_PROPERTY = "index.metadata";
private static final String INDEX_PREFIX = "meta_";
private static final String PARSE_META_PREFIX = "meta_";
@@ -56,7 +56,7 @@ public class MetadataIndexer implements
// add the fields from parsemd
if (parseFieldnames != null) {
- for (Entry<Utf8,String> metatag : parseFieldnames.entrySet()) {
+ for (Entry<Utf8, String> metatag : parseFieldnames.entrySet()) {
ByteBuffer bvalues = page.getMetadata().get(metatag.getKey());
if (bvalues != null) {
String key = metatag.getValue();
@@ -75,7 +75,7 @@ public class MetadataIndexer implements
public void setConf(Configuration conf) {
this.conf = conf;
String[] metatags = conf.getStrings(PARSE_CONF_PROPERTY);
- parseFieldnames = new TreeMap<Utf8,String>();
+ parseFieldnames = new TreeMap<Utf8, String>();
for (int i = 0; i < metatags.length; i++) {
parseFieldnames.put(
new Utf8(PARSE_META_PREFIX + metatags[i].toLowerCase(Locale.ROOT)),
Modified: nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java Fri Jan 9 06:34:33 2015
@@ -20,3 +20,4 @@
* Metadata may come from CrawlDb, parse or content metadata.
*/
package org.apache.nutch.indexer.metadata;
+
Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -30,10 +30,12 @@ import org.slf4j.LoggerFactory;
* Add (or reset) a few metaData properties as respective fields (if they are
* available), so that they can be accurately used within the search index.
*
- * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
- * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt
- * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative
- * that the content provider wants the filename therein to be used as the title.
+ * 'lastModified' is indexed to support query by date, 'contentLength' obtains
+ * content length from the HTTP header, 'type' field is indexed to support query
+ * by type and finally the 'title' field is an attempt to reset the title if a
+ * content-disposition hint exists. The logic is that such a presence is
+ * indicative that the content provider wants the filename therein to be used as
+ * the title.
*
* Still need to make content-length searchable!
*
@@ -41,7 +43,8 @@ import org.slf4j.LoggerFactory;
*/
public class MoreIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(MoreIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(MoreIndexingFilter.class);
/** Get the MimeTypes resolver instance. */
private MimeUtil MIME;
@@ -68,12 +71,13 @@ public class MoreIndexingFilter implemen
// last-modified, or, if that's not present, use fetch time.
private NutchDocument addTime(NutchDocument doc, WebPage page, String url) {
long time = -1;
- CharSequence lastModified = page
- .getHeaders().get(new Utf8(HttpHeaders.LAST_MODIFIED));
+ CharSequence lastModified = page.getHeaders().get(
+ new Utf8(HttpHeaders.LAST_MODIFIED));
// String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
if (lastModified != null) { // try parse last-modified
time = getTime(lastModified.toString(), url); // use as time
- String formlastModified = DateUtil.getThreadLocalDateFormat().format(new Date(time));
+ String formlastModified = DateUtil.getThreadLocalDateFormat().format(
+ new Date(time));
// store as string
doc.add("lastModified", formlastModified);
}
@@ -82,7 +86,8 @@ public class MoreIndexingFilter implemen
time = page.getModifiedTime(); // use Modified time
}
- String dateString = DateUtil.getThreadLocalDateFormat().format(new Date(time));
+ String dateString = DateUtil.getThreadLocalDateFormat().format(
+ new Date(time));
// un-stored, indexed and un-tokenized
doc.add("date", dateString);
@@ -97,17 +102,19 @@ public class MoreIndexingFilter implemen
} catch (ParseException e) {
// try to parse it as date in alternative format
try {
- Date parsedDate = DateUtils.parseDate(date, new String[] {
- "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
- "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, dd MMM yyyy HH:mm:ss zzz",
- "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz",
- "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz",
- "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS",
- "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss",
- "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz",
- "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz",
- "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss",
- "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", "yyyy-MM-dd'T'HH:mm:ss'Z'" });
+ Date parsedDate = DateUtils.parseDate(date,
+ new String[] { "EEE MMM dd HH:mm:ss yyyy",
+ "EEE MMM dd HH:mm:ss yyyy zzz", "EEE MMM dd HH:mm:ss zzz yyyy",
+ "EEE, dd MMM yyyy HH:mm:ss zzz",
+ "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz",
+ "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz",
+ "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS",
+ "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss",
+ "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz",
+ "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz",
+ "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss",
+ "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
+ "yyyy-MM-dd'T'HH:mm:ss'Z'" });
time = parsedDate.getTime();
// if (LOG.isWarnEnabled()) {
// LOG.warn(url + ": parsed date: " + date +" to:"+time);
@@ -123,8 +130,8 @@ public class MoreIndexingFilter implemen
// Add Content-Length
private NutchDocument addLength(NutchDocument doc, WebPage page, String url) {
- CharSequence contentLength = page.getHeaders().get(new Utf8(
- HttpHeaders.CONTENT_LENGTH));
+ CharSequence contentLength = page.getHeaders().get(
+ new Utf8(HttpHeaders.CONTENT_LENGTH));
if (contentLength != null) {
// NUTCH-1010 ContentLength not trimmed
String trimmed = contentLength.toString().trim();
@@ -188,7 +195,7 @@ public class MoreIndexingFilter implemen
if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
String[] parts = getParts(mimeType);
- for(String part: parts) {
+ for (String part : parts) {
doc.add("type", part);
}
}
@@ -233,8 +240,8 @@ public class MoreIndexingFilter implemen
}
private NutchDocument resetTitle(NutchDocument doc, WebPage page, String url) {
- CharSequence contentDisposition = page.getHeaders().get(new Utf8(
- HttpHeaders.CONTENT_DISPOSITION));
+ CharSequence contentDisposition = page.getHeaders().get(
+ new Utf8(HttpHeaders.CONTENT_DISPOSITION));
if (contentDisposition == null)
return doc;
Modified: nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -37,7 +37,7 @@ public class TestMoreIndexingFilter {
assertContentType(conf, "text/html", "text/html");
assertContentType(conf, "text/html; charset=UTF-8", "text/html");
}
-
+
public void testGetParts() {
String[] parts = MoreIndexingFilter.getParts("text/html");
assertParts(parts, 2, "text", "html");
@@ -48,34 +48,35 @@ public class TestMoreIndexingFilter {
* @since NUTCH-901
*/
@Test
- public void testNoParts(){
- Configuration conf = NutchConfiguration.create();
- conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
- MoreIndexingFilter filter = new MoreIndexingFilter();
- filter.setConf(conf);
- assertNotNull(filter);
- NutchDocument doc = new NutchDocument();
- try{
- filter.filter(doc, "http://nutch.apache.org/index.html", WebPage.newBuilder().build());
- }
- catch(Exception e){
- e.printStackTrace();
- fail(e.getMessage());
- }
- assertNotNull(doc);
- assertTrue(doc.getFieldNames().contains("type"));
- assertEquals(1, doc.getFieldValues("type").size());
- assertEquals("text/html", doc.getFieldValue("type"));
+ public void testNoParts() {
+ Configuration conf = NutchConfiguration.create();
+ conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+ MoreIndexingFilter filter = new MoreIndexingFilter();
+ filter.setConf(conf);
+ assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ try {
+ filter.filter(doc, "http://nutch.apache.org/index.html", WebPage
+ .newBuilder().build());
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ assertNotNull(doc);
+ assertTrue(doc.getFieldNames().contains("type"));
+ assertEquals(1, doc.getFieldValues("type").size());
+ assertEquals("text/html", doc.getFieldValue("type"));
}
-
+
private void assertParts(String[] parts, int count, String... expected) {
assertEquals(count, parts.length);
for (int i = 0; i < expected.length; i++) {
assertEquals(expected[i], parts[i]);
}
}
-
- private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
+
+ private void assertContentType(Configuration conf, String source,
+ String expected) throws IndexingException {
MoreIndexingFilter filter = new MoreIndexingFilter();
filter.setConf(conf);
WebPage page = WebPage.newBuilder().build();
Modified: nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java Fri Jan 9 06:34:33 2015
@@ -19,3 +19,4 @@
* Index writer plugin for <a href="http://www.elasticsearch.org/">Elasticsearch</a>.
*/
package org.apache.nutch.indexwriter.elastic;
+
Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Fri Jan 9 06:34:33 2015
@@ -22,7 +22,7 @@ public interface SolrConstants {
public static final String SERVER_URL = SOLR_PREFIX + "server.url";
public static final String COMMIT_SIZE = SOLR_PREFIX + "commit.size";
-
+
public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
public static final String MAPPING_FILE = SOLR_PREFIX + "mapping.file";
@@ -32,15 +32,15 @@ public interface SolrConstants {
public static final String USERNAME = SOLR_PREFIX + "auth.username";
public static final String PASSWORD = SOLR_PREFIX + "auth.password";
-
+
public static final String ID_FIELD = "id";
-
+
public static final String URL_FIELD = "url";
-
+
public static final String BOOST_FIELD = "boost";
-
+
public static final String TIMESTAMP_FIELD = "tstamp";
-
+
public static final String DIGEST_FIELD = "digest";
}
Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Fri Jan 9 06:34:33 2015
@@ -144,7 +144,9 @@ public class SolrIndexWriter implements
public void commit() throws IOException {
try {
solr.commit();
- LOG.info("Total " + documentCount + (documentCount > 1 ? " documents are " : " document is ") + "added.");
+ LOG.info("Total " + documentCount
+ + (documentCount > 1 ? " documents are " : " document is ")
+ + "added.");
} catch (SolrServerException e) {
throw makeIOException(e);
}
Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java Fri Jan 9 06:34:33 2015
@@ -38,16 +38,17 @@ import org.xml.sax.SAXException;
public class SolrMappingReader {
public static Logger LOG = LoggerFactory.getLogger(SolrMappingReader.class);
-
+
private Configuration conf;
-
+
private Map<String, String> keyMap = new HashMap<String, String>();
private Map<String, String> copyMap = new HashMap<String, String>();
private String uniqueKey = "id";
-
+
public static synchronized SolrMappingReader getInstance(Configuration conf) {
ObjectCache cache = ObjectCache.get(conf);
- SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName());
+ SolrMappingReader instance = (SolrMappingReader) cache
+ .getObject(SolrMappingReader.class.getName());
if (instance == null) {
instance = new SolrMappingReader(conf);
cache.setObject(SolrMappingReader.class.getName(), instance);
@@ -60,9 +61,10 @@ public class SolrMappingReader {
parseMapping();
}
- private void parseMapping() {
+ private void parseMapping() {
InputStream ssInputStream = null;
- ssInputStream = conf.getConfResourceAsInputStream(conf.get(SolrConstants.MAPPING_FILE, "solrindex-mapping.xml"));
+ ssInputStream = conf.getConfResourceAsInputStream(conf.get(
+ SolrConstants.MAPPING_FILE, "solrindex-mapping.xml"));
InputSource inputSource = new InputSource(ssInputStream);
try {
@@ -74,48 +76,50 @@ public class SolrMappingReader {
if (fieldList.getLength() > 0) {
for (int i = 0; i < fieldList.getLength(); i++) {
Element element = (Element) fieldList.item(i);
- LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
- keyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+ LOG.info("source: " + element.getAttribute("source") + " dest: "
+ + element.getAttribute("dest"));
+ keyMap.put(element.getAttribute("source"),
+ element.getAttribute("dest"));
}
}
NodeList copyFieldList = rootElement.getElementsByTagName("copyField");
if (copyFieldList.getLength() > 0) {
for (int i = 0; i < copyFieldList.getLength(); i++) {
Element element = (Element) copyFieldList.item(i);
- LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
- copyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+ LOG.info("source: " + element.getAttribute("source") + " dest: "
+ + element.getAttribute("dest"));
+ copyMap.put(element.getAttribute("source"),
+ element.getAttribute("dest"));
}
}
NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey");
if (uniqueKeyItem.getLength() > 1) {
LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'");
uniqueKey = "id";
- }
- else if (uniqueKeyItem.getLength() == 0) {
+ } else if (uniqueKeyItem.getLength() == 0) {
LOG.warn("No unique key definition found in solr index mapping using, default 'id'");
- }
- else{
- uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
+ } else {
+ uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
}
} catch (MalformedURLException e) {
- LOG.warn(e.toString());
+ LOG.warn(e.toString());
} catch (SAXException e) {
- LOG.warn(e.toString());
+ LOG.warn(e.toString());
} catch (IOException e) {
- LOG.warn(e.toString());
+ LOG.warn(e.toString());
} catch (ParserConfigurationException e) {
- LOG.warn(e.toString());
- }
+ LOG.warn(e.toString());
+ }
}
-
+
public Map<String, String> getKeyMap() {
return keyMap;
}
-
+
public Map<String, String> getCopyMap() {
return copyMap;
}
-
+
public String getUniqueKey() {
return uniqueKey;
}
@@ -128,14 +132,14 @@ public class SolrMappingReader {
}
public String mapKey(String key) throws IOException {
- if(keyMap.containsKey(key)) {
+ if (keyMap.containsKey(key)) {
key = keyMap.get(key);
}
return key;
}
public String mapCopyKey(String key) throws IOException {
- if(copyMap.containsKey(key)) {
+ if (copyMap.containsKey(key)) {
key = copyMap.get(key);
}
return key;
Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Fri Jan 9 06:34:33 2015
@@ -1,6 +1,5 @@
package org.apache.nutch.indexwriter.solr;
-
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
@@ -17,7 +16,8 @@ public class SolrUtils {
public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
- public static HttpSolrServer getHttpSolrServer(Configuration job) throws MalformedURLException {
+ public static HttpSolrServer getHttpSolrServer(Configuration job)
+ throws MalformedURLException {
DefaultHttpClient client = new DefaultHttpClient();
// Check for username/password
@@ -26,10 +26,14 @@ public class SolrUtils {
LOG.info("Authenticating as: " + username);
- AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+ AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT,
+ AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+
+ client.getCredentialsProvider().setCredentials(
+ scope,
+ new UsernamePasswordCredentials(username, job
+ .get(SolrConstants.PASSWORD)));
- client.getCredentialsProvider().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
-
HttpParams params = client.getParams();
HttpClientParams.setAuthenticating(params, true);
@@ -46,12 +50,14 @@ public class SolrUtils {
for (int i = 0; i < input.length(); i++) {
ch = input.charAt(i);
- // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
- // and non-printable control characters except tabulator, new line and carriage return
+ // Strip all non-characters
+ // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+ // and non-printable control characters except tabulator, new line and
+ // carriage return
if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
- ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
- (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
- (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
+ ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
+ (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
+ (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
retval.append(ch);
}
Modified: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java Fri Jan 9 06:34:33 2015
@@ -19,3 +19,4 @@
* Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>.
*/
package org.apache.nutch.indexwriter.solr;
+
Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Jan 9 06:34:33 2015
@@ -47,7 +47,8 @@ import java.util.*;
*/
public class HTMLLanguageParser implements ParseFilter {
- public static final Logger LOG = LoggerFactory.getLogger(HTMLLanguageParser.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(HTMLLanguageParser.class);
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -113,8 +114,8 @@ public class HTMLLanguageParser implemen
}
if (lang != null) {
- page.getMetadata().put(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang
- .getBytes()));
+ page.getMetadata().put(new Utf8(Metadata.LANGUAGE),
+ ByteBuffer.wrap(lang.getBytes()));
return parse;
}
@@ -135,7 +136,8 @@ public class HTMLLanguageParser implemen
return lang;
}
- CharSequence ulang = page.getHeaders().get(new Utf8(Response.CONTENT_LANGUAGE));
+ CharSequence ulang = page.getHeaders().get(
+ new Utf8(Response.CONTENT_LANGUAGE));
if (ulang != null) {
lang = ulang.toString();
}
@@ -154,7 +156,7 @@ public class HTMLLanguageParser implemen
String content = parse.getText();
if (content != null) {
- text.append(" ").append(content.toString());
+ text.append(" ").append(content.toString());
}
LanguageIdentifier identifier = new LanguageIdentifier(text.toString());
Modified: nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Jan 9 06:34:33 2015
@@ -35,11 +35,10 @@ import java.util.HashSet;
/**
* An {@link org.apache.nutch.indexer.IndexingFilter} that adds a
* <code>lang</code> (language) field to the document.
- *
- * It tries to find the language of the document by checking
- * if {@link HTMLLanguageParser} has added some language
- * information
- *
+ *
+ * It tries to find the language of the document by checking if
+ * {@link HTMLLanguageParser} has added some language information
+ *
* @author Sami Siren
* @author Jerome Charron
*/
@@ -56,7 +55,8 @@ public class LanguageIndexingFilter impl
/**
* Constructs a new Language Indexing Filter.
*/
- public LanguageIndexingFilter() {}
+ public LanguageIndexingFilter() {
+ }
public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
Modified: nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Fri Jan 9 06:34:33 2015
@@ -96,8 +96,8 @@ public class TestHTMLLanguageParser {
{ "torp, stuga, uthyres, bed & breakfast", null } };
for (int i = 0; i < 44; i++) {
- assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser
- .parseLanguage(tests[i][0]));
+ assertEquals(tests[i][1],
+ HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
}
}
@@ -151,8 +151,8 @@ public class TestHTMLLanguageParser {
page.setBaseUrl(BASE);
page.setContent(ByteBuffer.wrap(text.getBytes()));
page.setContentType(new Utf8("text/html"));
- page
- .getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/html"));
+ page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8,
+ new Utf8("text/html"));
return page;
}
}
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java Fri Jan 9 06:34:33 2015
@@ -19,7 +19,7 @@ package org.apache.nutch.protocol.http.a
@SuppressWarnings("serial")
public class BlockedException extends HttpException {
-
+
public BlockedException(String msg) {
super(msg);
}