You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by "Ratnesh,V2Solutions India" <ra...@in.v2solutions.com> on 2007/03/25 14:45:26 UTC
not able to index a field in lucene
Hi,
I want to search against a field, but I am having difficulty indexing that
field in Lucene. To test whether the field is indexed, I am using Luke
as a tool.
Here is an example of our program. I want to store "recommended" as a field
that is visible in Luke; in our case it would come from a tag such as
<meta name="rollno" value="5"> in our HTML page. So here I want to index
a field "rollno" which has the value 5 in the Lucene index.
I am basing my work on the entire program below; you can replace "recommended" with "rollno".
Expecting your earliest reply.
thanks
package org.apache.nutch.parse.recommended;
//JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import java.io.*;
//Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.metadata.Metadata;
//Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
//W3C imports
import org.w3c.dom.DocumentFragment;
import org.apache.nutch.util.NutchConfiguration;
/**
 * HTML parse filter that looks for a <code>recommended</code> meta tag and,
 * when one is present, stores its value in the parse's content metadata
 * under {@link #META_RECOMMENDED_NAME}.
 */
public class RecommendedParser implements HtmlParseFilter {

  private static final Log LOG =
      LogFactory.getLog(RecommendedParser.class.getName());

  /** The Recommended meta data attribute name */
  public static final String META_RECOMMENDED_NAME = "Recommended";

  /**
   * Name of the general meta tag that is looked up.
   * NOTE(review): matched in lower case; presumably HTMLMetaTags normalizes
   * tag names to lower case -- confirm against the HTML parser.
   */
  private static final String META_TAG_NAME = "recommended";

  /** Extension id passed to ParseUtil by {@link #main(String[])}. */
  private static final String EXTENSION_ID =
      "org.apache.nutch.parse.recommended.recommendedfilter";

  private Configuration conf;

  /**
   * Scan the HTML document looking for a recommended meta tag.
   *
   * @param content  the fetched content (not inspected here)
   * @param parse    the parse whose content metadata receives the value
   * @param metaTags the meta tags extracted from the page
   * @param doc      the DOM of the page (not inspected here)
   * @return the same <code>parse</code> instance, with the metadata entry
   *         added when the tag was found
   */
  public Parse filter(Content content, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    // A direct lookup replaces the scan over every property name; the
    // result is the same value (or null when the tag is absent).
    Properties generalMetaTags = metaTags.getGeneralTags();
    String recommendation = generalMetaTags.getProperty(META_TAG_NAME);

    if (recommendation == null) {
      LOG.info("No Recommendation");
    } else {
      LOG.info("Found a Recommendation for " + recommendation);
      LOG.info("Adding Recommendation for " + recommendation);
      parse.getData().getContentMeta().set(META_RECOMMENDED_NAME,
          recommendation);
    }
    return parse;
  }

  /**
   * Command-line check: reads a file as UTF-8, wraps it in a Content object
   * and runs it through ParseUtil, then prints the expected and the parsed
   * recommendation so they can be compared.
   *
   * @param ar <code>ar[0]</code> file to parse, <code>ar[1]</code> URL to
   *           associate with it, <code>ar[2]</code> expected recommendation
   * @throws Exception if the file cannot be read or parsing fails
   */
  public static void main(String ar[]) throws Exception {
    if (ar.length < 3) {
      System.err.println("Usage: " + RecommendedParser.class.getName()
          + " <file> <url> <recommendedContent>");
      return;
    }
    String url = ar[1];
    String recommendedContent = ar[2];

    // Read the whole file as UTF-8 and always release the file handle.
    StringBuffer sb = new StringBuffer();
    InputStream in = new FileInputStream(ar[0]);
    try {
      BufferedReader br =
          new BufferedReader(new InputStreamReader(in, "UTF-8"));
      String line;
      while ((line = br.readLine()) != null) {
        sb.append(line).append('\n');
      }
    } finally {
      in.close();
    }

    // Encode with the same charset the file was decoded with instead of
    // the platform default, so non-ASCII pages survive the round trip.
    byte[] bytes = sb.toString().getBytes("UTF-8");

    // NOTE(review): the content type is "text/xml" and the id below is the
    // HtmlParseFilter extension id from plugin.xml; parseByExtensionId may
    // expect a parser extension and an HTML content type -- confirm.
    String contentType = "text/xml";
    Configuration conf = NutchConfiguration.create();
    Content content =
        new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parseByExtensionId(EXTENSION_ID, content);

    String found = parse.getData().getMeta(META_RECOMMENDED_NAME);
    System.out.println("Expected recommendation: " + recommendedContent);
    System.out.println("Parsed recommendation:   " + found);
  }

  /** Stores the configuration handed to this filter. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously set with {@link #setConf}. */
  public Configuration getConf() {
    return this.conf;
  }
}
public class RecommendedIndexer implements IndexingFilter {
public static final Log LOG =
LogFactory.getLog(RecommendedIndexer.class.getName());
private Configuration conf;
public RecommendedIndexer() {
}
public Document filter(Document doc, Parse parse, UTF8 url,
CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
String recommendation = parse.getData().getMeta("Recommended");
if (recommendation != null) {
Field recommendedField =
new Field("recommended", recommendation,
Field.Store.YES, Field.Index.UN_TOKENIZED);
recommendedField.setBoost(5.0f);
doc.add(recommendedField);
LOG.info("Added " + recommendation + " to the recommended
Field");
}
return doc;
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
}
/**
 * Query filter for the "recommended" field; it passes the field name and a
 * boost of 5 to the FieldQueryFilter constructor.
 */
public class RecommendedQueryFilter extends FieldQueryFilter {

  // Logger is bound to this class so log lines are attributed correctly.
  private static final Log LOG =
      LogFactory.getLog(RecommendedQueryFilter.class.getName());

  public RecommendedQueryFilter() {
    super("recommended", 5f);
    LOG.info("Added a recommended query");
  }
}
This is the plugin.xml file:
----------------------------------------
----------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<plugin
id="recommended"
name="Recommended Parser/Filter"
version="0.0.1"
provider-name="nutch.org">
<runtime>
<!-- As defined in build.xml, this plugin will end up bundled as
recommended.jar. -->
<library name="recommended.jar">
<export name="*"/>
</library>
</runtime>
<!-- The RecommendedParser plugs into the HtmlParseFilter extension point to
grab the contents of any recommended meta tags. -->
<extension id="org.apache.nutch.parse.recommended.recommendedfilter"
name="Recommended Parser"
point="org.apache.nutch.parse.HtmlParseFilter">
<implementation id="RecommendedParser"
class="org.apache.nutch.parse.recommended.RecommendedParser"/>
</extension>
<!-- The RecommendedIndexer plugs into the IndexingFilter extension point in
order to add the contents of the recommended meta tags (as found by the
RecommendedParser) to the Lucene index. -->
<extension id="org.apache.nutch.parse.recommended.recommendedindexer"
name="Recommended identifier filter"
point="org.apache.nutch.indexer.IndexingFilter">
<implementation id="RecommendedIndexer"
class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
</extension>
<!-- The RecommendedQueryFilter gets called when you perform a search. It
runs a search for the user's query against the recommended field. In
order to add this to the list of filters that get run by default, you
have to use "fields=DEFAULT". -->
<extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
name="Recommended Search Query Filter"
point="org.apache.nutch.searcher.QueryFilter">
<implementation id="RecommendedQueryFilter"
class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
<parameter name="fields" value="recommended"/>
</implementation>
</extension>
</plugin>
I am not getting the desired result: "recommended" does not show up in
Luke as a stored field.
--
View this message in context: http://www.nabble.com/not-able-to-index-a-field-in-lucene-tf3462209.html#a9659562
Sent from the Nutch - User mailing list archive at Nabble.com.