You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2011/02/06 17:43:54 UTC

svn commit: r1067699 - in /lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src: java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip

Author: doronc
Date: Sun Feb  6 16:43:54 2011
New Revision: 1067699

URL: http://svn.apache.org/viewvc?rev=1067699&view=rev
Log:
LUCENE-1540: Improvements to contrib.benchmark for TREC collections - fix test failures in some locales due to toUpperCase()

Modified:
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip

Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1067699&r1=1067698&r2=1067699&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Sun Feb  6 16:43:54 2011
@@ -29,7 +29,12 @@ import java.util.Map;
 public abstract class TrecDocParser {
 
   /** Types of trec parse paths, */
-  public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
+  public enum ParsePathType { GOV2("gov2"), FBIS("fbis"), FT("ft"), FR94("fr94"), LATIMES("latimes"); 
+    public final String dirName;
+    private ParsePathType(String dirName) {
+      this.dirName = dirName;
+    }
+  }
   
   /** trec parser type used for unknown extensions */
   public static final ParsePathType DEFAULT_PATH_TYPE  = ParsePathType.GOV2;
@@ -46,7 +51,7 @@ public abstract class TrecDocParser {
   static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
   static {
     for (ParsePathType ppt : ParsePathType.values()) {
-      pathName2Type.put(ppt.name(),ppt);
+      pathName2Type.put(ppt.dirName,ppt);
     }
   }
   
@@ -59,7 +64,7 @@ public abstract class TrecDocParser {
   public static ParsePathType pathType(File f) {
     int pathLength = 0;
     while (f != null && ++pathLength < MAX_PATH_LENGTH) {
-      ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase());
+      ParsePathType ppt = pathName2Type.get(f.getName());
       if (ppt!=null) {
         return ppt;
       }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip?rev=1067699&r1=1067698&r2=1067699&view=diff
==============================================================================
Binary files - no diff available.



Re: svn commit: r1067699 - in /lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src: java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip

Posted by Doron Cohen <cd...@gmail.com>.
Interesting... Thanks Robert for pointing this out!

> "To obtain correct results for locale insensitive strings, use
toUpperCase(Locale.ENGLISH)"

Actually, this is one of the things I tried, and it did solve the problem - with
toUpperCase(Locale.US), which is not exactly Locale.ENGLISH but quite similar I
assume - and, as you suggest, it felt wrong, for the wrong reasons...

Perhaps I'll change it like this; case insensitivity is a good thing when
running on various OSes.

On Sun, Feb 6, 2011 at 6:55 PM, Robert Muir <rc...@gmail.com> wrote:

> Thanks for catching this Doron. Another option if you want to keep the
> case-insensitive feature here would be to use
> toUpperCase(Locale.ENGLISH)
>
> It might look bad, but it's actually recommended by the JDK for
> locale-insensitive strings:
>
> http://download.oracle.com/javase/6/docs/api/java/lang/String.html#toUpperCase()
>
> On Sun, Feb 6, 2011 at 11:43 AM,  <do...@apache.org> wrote:
> > Author: doronc
> > Date: Sun Feb  6 16:43:54 2011
> > New Revision: 1067699
> >
> > URL: http://svn.apache.org/viewvc?rev=1067699&view=rev
> > Log:
> > LUCENE-1540: Improvements to contrib.benchmark for TREC collections - fix
> test failures in some locales due to toUpperCase()
> >
> > Modified:
> >
>  lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
> >
>  lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip
> >
> > Modified:
> lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
> > URL:
> http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1067699&r1=1067698&r2=1067699&view=diff
> >
> ==============================================================================
> > ---
> lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
> (original)
> > +++
> lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
> Sun Feb  6 16:43:54 2011
> > @@ -29,7 +29,12 @@ import java.util.Map;
> >  public abstract class TrecDocParser {
> >
> >   /** Types of trec parse paths, */
> > -  public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
> > +  public enum ParsePathType { GOV2("gov2"), FBIS("fbis"), FT("ft"),
> FR94("fr94"), LATIMES("latimes");
> > +    public final String dirName;
> > +    private ParsePathType(String dirName) {
> > +      this.dirName = dirName;
> > +    }
> > +  }
> >
> >   /** trec parser type used for unknown extensions */
> >   public static final ParsePathType DEFAULT_PATH_TYPE  =
> ParsePathType.GOV2;
> > @@ -46,7 +51,7 @@ public abstract class TrecDocParser {
> >   static final Map<String,ParsePathType> pathName2Type = new
> HashMap<String,ParsePathType>();
> >   static {
> >     for (ParsePathType ppt : ParsePathType.values()) {
> > -      pathName2Type.put(ppt.name(),ppt);
> > +      pathName2Type.put(ppt.dirName,ppt);
> >     }
> >   }
> >
> > @@ -59,7 +64,7 @@ public abstract class TrecDocParser {
> >   public static ParsePathType pathType(File f) {
> >     int pathLength = 0;
> >     while (f != null && ++pathLength < MAX_PATH_LENGTH) {
> > -      ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase());
> > +      ParsePathType ppt = pathName2Type.get(f.getName());
> >       if (ppt!=null) {
> >         return ppt;
> >       }
> >
> > Modified:
> lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip
> > URL:
> http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip?rev=1067699&r1=1067698&r2=1067699&view=diff
> >
> ==============================================================================
> > Binary files - no diff available.
> >
> >
> >
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: dev-help@lucene.apache.org
>
>

Re: svn commit: r1067699 - in /lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src: java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip

Posted by Robert Muir <rc...@gmail.com>.
Thanks for catching this Doron. Another option if you want to keep the
case-insensitive feature here would be to use
toUpperCase(Locale.ENGLISH)

It might look bad, but it's actually recommended by the JDK for
locale-insensitive strings:
http://download.oracle.com/javase/6/docs/api/java/lang/String.html#toUpperCase()

On Sun, Feb 6, 2011 at 11:43 AM,  <do...@apache.org> wrote:
> Author: doronc
> Date: Sun Feb  6 16:43:54 2011
> New Revision: 1067699
>
> URL: http://svn.apache.org/viewvc?rev=1067699&view=rev
> Log:
> LUCENE-1540: Improvements to contrib.benchmark for TREC collections - fix test failures in some locales due to toUpperCase()
>
> Modified:
>    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
>    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip
>
> Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
> URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1067699&r1=1067698&r2=1067699&view=diff
> ==============================================================================
> --- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
> +++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Sun Feb  6 16:43:54 2011
> @@ -29,7 +29,12 @@ import java.util.Map;
>  public abstract class TrecDocParser {
>
>   /** Types of trec parse paths, */
> -  public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
> +  public enum ParsePathType { GOV2("gov2"), FBIS("fbis"), FT("ft"), FR94("fr94"), LATIMES("latimes");
> +    public final String dirName;
> +    private ParsePathType(String dirName) {
> +      this.dirName = dirName;
> +    }
> +  }
>
>   /** trec parser type used for unknown extensions */
>   public static final ParsePathType DEFAULT_PATH_TYPE  = ParsePathType.GOV2;
> @@ -46,7 +51,7 @@ public abstract class TrecDocParser {
>   static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
>   static {
>     for (ParsePathType ppt : ParsePathType.values()) {
> -      pathName2Type.put(ppt.name(),ppt);
> +      pathName2Type.put(ppt.dirName,ppt);
>     }
>   }
>
> @@ -59,7 +64,7 @@ public abstract class TrecDocParser {
>   public static ParsePathType pathType(File f) {
>     int pathLength = 0;
>     while (f != null && ++pathLength < MAX_PATH_LENGTH) {
> -      ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase());
> +      ParsePathType ppt = pathName2Type.get(f.getName());
>       if (ppt!=null) {
>         return ppt;
>       }
>
> Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip
> URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip?rev=1067699&r1=1067698&r2=1067699&view=diff
> ==============================================================================
> Binary files - no diff available.
>
>
>

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org