You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/05/05 23:32:51 UTC
svn commit: r168422 -
/incubator/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: cutting
Date: Thu May 5 14:32:51 2005
New Revision: 168422
URL: http://svn.apache.org/viewcvs?rev=168422&view=rev
Log:
Always index a date for a page, to improve sorting and range
searches.
Modified:
incubator/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Modified: incubator/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=168422&r1=168421&r2=168422&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ incubator/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu May 5 14:32:51 2005
@@ -87,7 +87,7 @@
// normalize metaData (see note in the method below).
Properties metaData = normalizeMeta(parse.getData().getMetadata());
- addTime(doc, metaData, url);
+ addTime(doc, metaData, url, fo);
addLength(doc, metaData, url);
@@ -98,40 +98,24 @@
return doc;
}
- // Add time related meta info, now Last-Modified only
- // Others for consideration: Date, Expires
- private Document addTime(Document doc, Properties metaData, String url) {
+ // Add time-related meta info. Store last-modified whenever the header is present (-1 if it cannot be parsed).
+ // Index date from the parsed last-modified time or, if that is unavailable, from the fetch time.
+ private Document addTime(Document doc, Properties metaData, String url,
+ FetcherOutput fo) {
+ long time = -1;
String lastModified = metaData.getProperty("last-modified");
- if (lastModified == null)
- return doc;
-
- // try to figure out last-modified as long value
- DateFormat df = new SimpleDateFormat("EEE MMM dd HH:mm:ss yyyy zzz");
- long time = -1;
- try {
- time = HttpDateFormat.toLong(lastModified);
- } catch (ParseException e) {
- // try to parse it as date in alternative format
- try {
- Date d = df.parse(lastModified);
- time = d.getTime();
- } catch (Exception e1) {
- LOG.warning(url+": can't parse erroneous last-modified: "+lastModified);
- }
+ if (lastModified != null) { // try parse last-modified
+ time = getTime(lastModified,url); // use as time
+ // store as string
+ doc.add(Field.UnIndexed("lastModified", new Long(time).toString()));
}
- if (time == -1) {
- // or instead set it to current time at indexing?
- //time = System.currentTimeMillis();
- // for now, we just do nothing
- return doc;
+ if (time == -1) { // if no last-modified
+ time = fo.getFetchDate(); // use fetch time
}
- // store last-modified as string
- doc.add(Field.UnIndexed("lastModified", new Long(time).toString()));
-
- // add support for query syntax date: using last-modified
+ // add support for query syntax date:
// query filter is implemented in DateQueryFilter.java
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
@@ -141,6 +125,23 @@
doc.add(new Field("date", dateString, false, true, false));
return doc;
+ }
+
+ private long getTime(String date, String url) {
+ long time = -1;
+ try {
+ time = HttpDateFormat.toLong(date);
+ } catch (ParseException e) {
+ // try to parse it as date in alternative format
+ try {
+ DateFormat df = new SimpleDateFormat("EEE MMM dd HH:mm:ss yyyy zzz");
+ Date d = df.parse(date);
+ time = d.getTime();
+ } catch (Exception e1) {
+ LOG.warning(url+": can't parse erroneous date: "+date);
+ }
+ }
+ return time;
}
// Add Content-Length