You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by "Ratnesh,V2Solutions India" <ra...@in.v2solutions.com> on 2007/03/25 14:45:26 UTC

not able to index a field in lucene

Hi,
I want to search against a field, but I am having difficulty indexing that
field in Lucene. To test whether the field is indexed or not, I am using Luke
as a tool.

Here is an example of our program. I want to store "recommended" as a field
that is visible in Luke; in our case it would come from a tag such as
<meta name="rollno" value="5"> in our HTML page. So here I want to index a
field rollno, which has the value 5, in the Lucene index.

I am basing my work on this entire program; you can replace recommended with rollno.

Expecting your earliest reply.
thanks

package org.apache.nutch.parse.recommended;

//JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import java.io.*;
//Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.metadata.Metadata;

//Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

//W3C imports
import org.w3c.dom.DocumentFragment;
import org.apache.nutch.util.NutchConfiguration;



/**
 * HTML parse filter that looks for a general meta tag named
 * "recommended" and, when one is present, copies its value into the
 * content metadata of the parse under {@link #META_RECOMMENDED_NAME}.
 */
public class RecommendedParser implements HtmlParseFilter {

  private static final Log LOG =
      LogFactory.getLog(RecommendedParser.class.getName());

  /** Name of the general meta tag whose value is extracted. */
  private static final String META_TAG_NAME = "recommended";

  /** Character encoding used by {@link #main(String[])} to read and re-encode the input file. */
  private static final String INPUT_ENCODING = "UTF-8";

  private Configuration conf;

  /** The Recommended meta data attribute name */
  public static final String META_RECOMMENDED_NAME="Recommended";

  /**
   * Scan the HTML document looking for a recommended meta tag.
   *
   * @param content  the fetched content (not used by this filter)
   * @param parse    the parse whose content metadata is updated
   * @param metaTags the meta tags collected for the page; its general tags
   *                 are searched for the "recommended" entry
   * @param doc      the DOM of the page (not used by this filter)
   * @return the same {@code parse} instance that was passed in
   */
  public Parse filter(Content content, Parse parse, 
    HTMLMetaTags metaTags, DocumentFragment doc) {
    Properties generalMetaTags = metaTags.getGeneralTags();

    // A direct lookup replaces the scan over every property name: the only
    // key of interest is fixed, and getProperty returns null when it is absent.
    String recommendation = generalMetaTags.getProperty(META_TAG_NAME);

    if (recommendation == null) {
      LOG.info("No Recommendation");
      return parse;
    }

    LOG.info("Found a Recommendation for " + recommendation);
    LOG.info("Adding Recommendation for " + recommendation);
    parse.getData().getContentMeta().set(META_RECOMMENDED_NAME,
        recommendation);
    return parse;
  }

  /**
   * Small command-line driver: reads a file, wraps it in a {@link Content}
   * and runs it through {@link ParseUtil}, then prints the value stored
   * under {@link #META_RECOMMENDED_NAME} (or {@code null} if none was found).
   *
   * @param ar {@code ar[0]} is the file to parse, {@code ar[1]} the URL to
   *           associate with it; a third argument is required for backward
   *           compatibility but is currently not used
   * @throws Exception if the file cannot be read or parsing fails
   */
  public static void main(String ar[]) throws Exception {
    if (ar.length < 3) {
      System.err.println("Usage: " + RecommendedParser.class.getName()
          + " <file> <url> <recommended>");
      return;
    }

    // Read the whole file as UTF-8 text; always release the file handle.
    StringBuffer sb = new StringBuffer();
    InputStream in = new FileInputStream(ar[0]);
    try {
      BufferedReader br =
          new BufferedReader(new InputStreamReader(in, INPUT_ENCODING));
      String line;
      while ((line = br.readLine()) != null) {
        sb.append(line).append("\n");
      }
    } finally {
      in.close();
    }

    String contentType = "text/xml";
    String url = ar[1];
    // Encode with the same charset used for reading, not the platform default.
    byte[] bytes = sb.toString().getBytes(INPUT_ENCODING);

    Configuration conf = NutchConfiguration.create();
    Content content =
        new Content(url, url, bytes, contentType, new Metadata(), conf);

    // NOTE(review): the id passed here is this plugin's HtmlParseFilter
    // extension id and the content type is text/xml. An earlier draft of this
    // driver used "parse-html" with text/html instead - confirm which id
    // ParseUtil.parseByExtensionId expects before relying on this driver.
    Parse parse = new ParseUtil(conf).parseByExtensionId(
        "org.apache.nutch.parse.recommended.recommendedfilter", content);

    // Report the outcome so the driver is usable for checking the filter.
    if (parse != null) {
      System.out.println(META_RECOMMENDED_NAME + ": "
          + parse.getData().getMeta(META_RECOMMENDED_NAME));
    }
  }

  /** Stores the configuration handed to this filter. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously set with {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }  
}


public class RecommendedIndexer implements IndexingFilter {
    
	  public static final Log LOG =
LogFactory.getLog(RecommendedIndexer.class.getName());
	  
	  private Configuration conf;
	  
	  public RecommendedIndexer() {
	  }

	  public Document filter(Document doc, Parse parse, UTF8 url, 
	    CrawlDatum datum, Inlinks inlinks)
	    throws IndexingException {

	    String recommendation = parse.getData().getMeta("Recommended");

	        if (recommendation != null) {
	            Field recommendedField = 
	                new Field("recommended", recommendation, 
	                    Field.Store.YES, Field.Index.UN_TOKENIZED);
	            recommendedField.setBoost(5.0f);
	            doc.add(recommendedField);
	            LOG.info("Added " + recommendation + " to the recommended
Field");
	        }

	    return doc;
	  }
	  
	  public void setConf(Configuration conf) {
	    this.conf = conf;
	  }

	  public Configuration getConf() {
	    return this.conf;
	  }  
	}



/**
 * Query filter for the "recommended" field; it passes the field name and a
 * boost of 5 to the {@code FieldQueryFilter} constructor.
 */
public class RecommendedQueryFilter extends FieldQueryFilter {
    // Logger is keyed on this class so its messages are attributed to the
    // query filter rather than to RecommendedParser.
    private static final Log LOG =
        LogFactory.getLog(RecommendedQueryFilter.class.getName());

    /** Registers the filter for the "recommended" field with boost 5. */
    public RecommendedQueryFilter() {
        super("recommended", 5f);
        LOG.info("Added a recommended query");
    }
  
}


This is the plugin.xml file:
----------------------------------------
----------------------------------------

<?xml version="1.0" encoding="UTF-8"?>
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- As defined in build.xml this plugin will end up bundled as
recommended.jar -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- The RecommendedParser extends the HtmlParseFilter to grab the
contents of
        any recommended meta tags -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                     
class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- The RecommendedIndexer extends the IndexingFilter in order to add the
        contents of the recommended meta tags (as found by the
        RecommendedParser) to the Lucene index. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended identifier filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                     
class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- The RecommendedQueryFilter gets called when you perform a search. It
        runs a search for the user's query against the recommended fields. In
        order to add this to the list of filters that get run by default, you
        have to use "fields=DEFAULT". -->
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                     
class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
                      <parameter name="fields" value="recommended"/>
                      </implementation>
                      
   </extension>

</plugin>

However, I am not getting the desired result, since recommended is not stored
as a field in the index when I inspect it with Luke.
-- 
View this message in context: http://www.nabble.com/not-able-to-index-a-field-in-lucene-tf3462209.html#a9659562
Sent from the Nutch - User mailing list archive at Nabble.com.