You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@nutch.apache.org by "Julien Nioche (JIRA)" <ji...@apache.org> on 2014/06/17 10:53:02 UTC

[jira] [Assigned] (NUTCH-1269) Generate main problems

     [ https://issues.apache.org/jira/browse/NUTCH-1269?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Julien Nioche reassigned NUTCH-1269:
------------------------------------

    Assignee: Julien Nioche

> Generate main problems
> ----------------------
>
>                 Key: NUTCH-1269
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1269
>             Project: Nutch
>          Issue Type: Improvement
>          Components: generator
>    Affects Versions: 1.4
>         Environment: software
>            Reporter: behnam nikbakht
>            Assignee: Julien Nioche
>              Labels: Generate, MaxHostCount, MaxNumSegments
>             Fix For: 1.9
>
>         Attachments: NUTCH-1269-v.2.patch, NUTCH-1269.patch
>
>
> There are some problems with the current Generate method when the maxNumSegments and maxHostCount options are used:
> 1. The sizes of the generated segments differ.
> 2. With the maxHostCount option, it is unclear whether it was applied or not.
> 3. URLs from one host are distributed non-uniformly between segments.
> We changed Generator.java as described below.
> In the Selector class:
>     private int maxNumSegments;
>     private int segmentSize;
>     private int maxHostCount;
> public void config
> ...
>       maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
>       segmentSize=(int)job.getInt(GENERATOR_TOP_N, 10000000)/maxNumSegments;
>       maxHostCount=job.getInt("GENERATE_MAX_PER_HOST", 100);  
> ...
> public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
>         OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
>         throws IOException {
> 	int limit2=(int)((limit*3)/2);
>       while (values.hasNext()) {
> 	if(count == limit)
>                 break;
>         if (count % segmentSize == 0 ) {
>           if (currentsegmentnum < maxNumSegments-1){
>             currentsegmentnum++;
>           }
>           else
>                 currentsegmentnum=0;
>         }
>         boolean full=true;
>         for(int jk=0;jk<maxNumSegments;jk++){
>         	if (segCounts[jk]<segmentSize){
>         		full=false;
>         	}
>         }
>         if(full){
>         	break;
>         }
>         SelectorEntry entry = values.next();
>         Text url = entry.url;
>                 //logWrite("Generated3:"+limit+"-"+count+"-"+url.toString());
>         String urlString = url.toString();
>         URL u = null;
>         String hostordomain = null;
>         try {
>           if (normalise && normalizers != null) {
>             urlString = normalizers.normalize(urlString,
>                 URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
>           }
>        
>           u = new URL(urlString);
>           if (byDomain) {
>             hostordomain = URLUtil.getDomainName(u);
>           } else {
>             hostordomain = new URL(urlString).getHost();
>           }
>  
> 	hostordomain = hostordomain.toLowerCase();
>         boolean countLimit=true;
>         // only filter if we are counting hosts or domains
>              int[] hostCount = hostCounts.get(hostordomain);
>              //host count: {a,b,c,d} means that from this host there are a urls in segment 0 and b urls in seg 1 and ...
>              if (hostCount == null) {
>                  hostCount = new int[maxNumSegments];
>                  for(int kl=0;kl<hostCount.length;kl++)
>                          hostCount[kl]=0;
>                  hostCounts.put(hostordomain, hostCount);
>              }  
>                  int selectedSeg=currentsegmentnum;
>                  int minCount=hostCount[selectedSeg];
>                  for(int jk=0;jk<maxNumSegments;jk++){
>                          if(hostCount[jk]<minCount){
>                                  minCount=hostCount[jk];
>                                  selectedSeg=jk;
>                          }
>                 }
>                 if(hostCount[selectedSeg]<=maxHostCount){
>                         count++;
>                         entry.segnum = new IntWritable(selectedSeg);
>                         hostCount[selectedSeg]++;
>                         output.collect(key, entry);
>                 }
>         } catch (Exception e) {
>           LOG.warn("Malformed URL: '" + urlString + "', skipping ("
>                 logWrite("Generate-malform:"+hostordomain+"-"+url.toString());
>               + StringUtils.stringifyException(e) + ")");
>           //continue;
>         }
>       }
>     }
>     



--
This message was sent by Atlassian JIRA
(v6.2#6252)