You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/02/08 19:06:16 UTC
svn commit: r1729219 -
/nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParseUtil.java
Author: lewismc
Date: Mon Feb 8 18:06:15 2016
New Revision: 1729219
URL: http://svn.apache.org/viewvc?rev=1729219&view=rev
Log:
NUTCH-1314 Impose a limit on the length of outlink target urls
Added:
nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParseUtil.java
Added: nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParseUtil.java?rev=1729219&view=auto
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParseUtil.java (added)
+++ nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParseUtil.java Mon Feb 8 18:06:15 2016
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlStatus;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+
+import static org.junit.Assert.*;
+
+/**
+ * Unit tests for {@link ParseUtil}: checks that outlinks whose target URL is
+ * longer than the configured maximum are left out of the parsed page.
+ *
+ * @author Canan Girgin
+ */
+public class TestParseUtil {
+
+  private Configuration conf;
+  private ParseUtil parseUtil;
+
+  /** Creates a ParseUtil with "parser.html.outlinks.max.target.length" set to 40. */
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("parser.html.outlinks.max.target.length", "40");
+    parseUtil = new ParseUtil(conf);
+  }
+
+  /** Processes plain text holding three URLs; only the two short ones may remain. */
+  @Test
+  public void testOutlinksMaxLength() throws UnsupportedEncodingException {
+    WebPage page = WebPage.newBuilder().build();
+    page.setBaseUrl(new Utf8("http://www.example.com/"));
+    page.setContentType(new Utf8("text/plain"));
+    String content = "Test with http://www.nutch.org/index.html is it found? "
+        + "What about http://www.apache.org/foundation/ "
+        + "A longer URL could be http://www.sybit.com/solutions/portals.html";
+    page.setContent(ByteBuffer.wrap(content.getBytes("utf-8")));
+    page.setStatus((int) CrawlStatus.STATUS_FETCHED);
+    parseUtil.process("http://www.example.com/", page);
+    // assertEquals reports the actual count on failure, unlike assertTrue(a == b).
+    assertEquals("Unexpected number of outlinks", 2, page.getOutlinks().size());
+    assertTrue("Wrong URL", page.getOutlinks().containsKey(new Utf8("http://www.nutch.org/index.html")));
+    assertTrue("Wrong URL", page.getOutlinks().containsKey(new Utf8("http://www.apache.org/foundation/")));
+    // The third URL is 43 characters long, above the configured limit of 40.
+    assertFalse("Over-long URL kept", page.getOutlinks().containsKey(new Utf8("http://www.sybit.com/solutions/portals.html")));
+  }
+}