You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2009/09/07 15:12:01 UTC

svn commit: r812122 - in /lucene/solr/trunk/contrib/dataimporthandler: ./ src/main/java/org/apache/solr/handler/dataimport/ src/test/java/org/apache/solr/handler/dataimport/

Author: shalin
Date: Mon Sep  7 13:12:01 2009
New Revision: 812122

URL: http://svn.apache.org/viewvc?rev=812122&view=rev
Log:
SOLR-1406 -- Make FileDataSource and FileListEntityProcessor to be more extensible, fix variable resolving bug in FileListEntityProcessor and add tests

Modified:
    lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
    lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java
    lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java
    lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java

Modified: lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt Mon Sep  7 13:12:01 2009
@@ -155,6 +155,8 @@
 
 37.SOLR-1348: Support binary field type in convertType logic in JdbcDataSource (shalin)
 
+38.SOLR-1406: Make FileDataSource and FileListEntityProcessor to be more extensible (Luke Forehand, shalin)
+
 Optimizations
 ----------------------
 1. SOLR-846:  Reduce memory consumption during delta import by removing keys when used

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java Mon Sep  7 13:12:01 2009
@@ -44,9 +44,15 @@
 public class FileDataSource extends DataSource<Reader> {
   public static final String BASE_PATH = "basePath";
 
-  private String basePath;
+  /**
+   * The basePath for this data source
+   */
+  protected String basePath;
 
-  private String encoding = null;
+  /**
+   * The encoding with which the given file should be read
+   */
+  protected String encoding = null;
 
   private static final Logger LOG = LoggerFactory.getLogger(FileDataSource.class);
 
@@ -95,7 +101,16 @@
     }
   }
 
-  private InputStreamReader openStream(File file) throws FileNotFoundException,
+  /**
+   * Open a {@link java.io.Reader} for the given file
+   *
+   * @param file a {@link java.io.File} instance
+   * @return a Reader on the given file
+   * @throws FileNotFoundException if the File does not exist
+   * @throws UnsupportedEncodingException if the encoding is unsupported
+   * @since solr 1.4
+   */
+  protected Reader openStream(File file) throws FileNotFoundException,
           UnsupportedEncodingException {
     if (encoding == null) {
       return new InputStreamReader(new FileInputStream(file));

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java Mon Sep  7 13:12:01 2009
@@ -54,13 +54,54 @@
  * @since solr 1.3
  */
 public class FileListEntityProcessor extends EntityProcessorBase {
-  private String fileName, baseDir, excludes;
-
-  private Date newerThan, olderThan;
-
-  private long biggerThan = -1, smallerThan = -1;
-
-  private boolean recursive = false;
+  /**
+   * A regex pattern to identify files given in data-config.xml after resolving any variables 
+   */
+  protected String fileName;
+
+  /**
+   * The baseDir given in data-config.xml after resolving any variables
+   */
+  protected String baseDir;
+
+  /**
+   * A Regex pattern of excluded file names as given in data-config.xml after resolving any variables
+   */
+  protected String excludes;
+
+  /**
+   * The newerThan given in data-config as a {@link java.util.Date}
+   * <p>
+   * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
+   * </p>
+   */
+  protected Date newerThan;
+
+  /**
+   * The olderThan given in data-config as a {@link java.util.Date}
+   */
+  protected Date olderThan;
+
+  /**
+   * The biggerThan given in data-config as a long value
+   * <p>
+   * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
+   * </p>
+   */
+  protected long biggerThan = -1;
+
+  /**
+   * The smallerThan given in data-config as a long value
+   * <p>
+   * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
+   * </p>
+   */
+  protected long smallerThan = -1;
+
+  /**
+   * The recursive given in data-config. Default value is false.
+   */
+  protected boolean recursive = false;
 
   private Pattern fileNamePattern, excludesPattern;
 
@@ -91,13 +132,23 @@
     }
   }
 
+  /**
+   * Get the Date object corresponding to the given string.
+   *
+   * @param dateStr the date string. It can be a DateMath string or it may have an evaluator function
+   * @return a Date instance corresponding to the input string
+   */
   private Date getDate(String dateStr) {
     if (dateStr == null)
       return null;
 
     Matcher m = PLACE_HOLDER_PATTERN.matcher(dateStr);
     if (m.find()) {
-      return (Date) resolver.resolve(dateStr);
+      Object o = resolver.resolve(m.group(1));
+      if (o instanceof Date)  return (Date)o;
+      dateStr = (String) o;
+    } else  {
+      dateStr = resolver.replaceTokens(dateStr);
     }
     m = EvaluatorBag.IN_SINGLE_QUOTES.matcher(dateStr);
     if (m.find()) {
@@ -118,9 +169,34 @@
     }
   }
 
+  /**
+   * Get the Long value for the given string after resolving any evaluator or variable.
+   *
+   * @param sizeStr the size as a string
+   * @return the Long value corresponding to the given string
+   */
+  private Long getSize(String sizeStr)  {
+    if (sizeStr == null)
+      return null;
+
+    Matcher m = PLACE_HOLDER_PATTERN.matcher(sizeStr);
+    if (m.find()) {
+      Object o = resolver.resolve(m.group(1));
+      if (o instanceof Number) {
+        Number number = (Number) o;
+        return number.longValue();
+      }
+      sizeStr = (String) o;
+    } else  {
+      sizeStr = resolver.replaceTokens(sizeStr);
+    }
+
+    return Long.parseLong(sizeStr);
+  }
+
   public Map<String, Object> nextRow() {
     if (rowIterator != null)
-      return getAndApplyTrans();
+      return getNext();
     List<Map<String, Object>> fileDetails = new ArrayList<Map<String, Object>>();
     File dir = new File(baseDir);
 
@@ -128,17 +204,16 @@
     newerThan = getDate(dateStr);
     dateStr = context.getEntityAttribute(OLDER_THAN);
     olderThan = getDate(dateStr);
+    String biggerThanStr = context.getEntityAttribute(BIGGER_THAN);
+    if (biggerThanStr != null)
+      biggerThan = getSize(biggerThanStr);
+    String smallerThanStr = context.getEntityAttribute(SMALLER_THAN);
+    if (smallerThanStr != null)
+      smallerThan = getSize(smallerThanStr);
 
     getFolderFiles(dir, fileDetails);
     rowIterator = fileDetails.iterator();
-    return getAndApplyTrans();
-  }
-
-  private Map<String, Object> getAndApplyTrans() {    
-    while (true) {
-      Map<String, Object> r = getNext();
-      return r;
-    }
+    return getNext();
   }
 
   private void getFolderFiles(File dir, final List<Map<String, Object>> fileDetails) {
@@ -186,7 +261,7 @@
   }
 
   public static final Pattern PLACE_HOLDER_PATTERN = Pattern
-          .compile("\\$\\{.*?\\}");
+          .compile("\\$\\{(.*?)\\}");
 
   public static final String DIR = "fileDir";
 

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java Mon Sep  7 13:12:01 2009
@@ -22,10 +22,7 @@
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 /**
  * <p>
@@ -63,22 +60,65 @@
     }
     Assert.assertEquals(2, fList.size());
   }
-
+  
   @Test
-  public void testNTOT() throws IOException {
+  public void testBiggerSmallerFiles() throws IOException {
     long time = System.currentTimeMillis();
     File tmpdir = new File("." + time);
     tmpdir.mkdir();
     tmpdir.deleteOnExit();
-    createFile(tmpdir, "a.xml", "a.xml".getBytes(), true);
-    createFile(tmpdir, "b.xml", "b.xml".getBytes(), true);
-    createFile(tmpdir, "c.props", "c.props".getBytes(), true);
+    long minLength = Long.MAX_VALUE;
+    String smallestFile = "";
+    byte[] content = "abcdefgij".getBytes("UTF-8");
+    createFile(tmpdir, "a.xml", content, false);
+    if (minLength > content.length) {
+      minLength = content.length;
+      smallestFile = "a.xml";
+    }
+    content = "abcdefgij".getBytes("UTF-8");
+    createFile(tmpdir, "b.xml", content, false);
+    if (minLength > content.length) {
+      minLength = content.length;
+      smallestFile = "b.xml";
+    }
+    content = "abc".getBytes("UTF-8");
+    createFile(tmpdir, "c.props", content, false);
+    if (minLength > content.length) {
+      minLength = content.length;
+      smallestFile = "c.props";
+    }
     Map attrs = AbstractDataImportHandlerTest.createMap(
-            FileListEntityProcessor.FILE_NAME, "xml$",
+            FileListEntityProcessor.FILE_NAME, ".*",
             FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
-            FileListEntityProcessor.OLDER_THAN, "'NOW'");
+            FileListEntityProcessor.BIGGER_THAN, String.valueOf(minLength));
+    List<String> fList = getFiles(null, attrs);
+    Assert.assertEquals(2, fList.size());
+    Set<String> l = new HashSet<String>();
+    l.add(new File(tmpdir, "a.xml").getAbsolutePath());
+    l.add(new File(tmpdir, "b.xml").getAbsolutePath());
+    Assert.assertEquals(l, new HashSet<String>(fList));
+    attrs = AbstractDataImportHandlerTest.createMap(
+            FileListEntityProcessor.FILE_NAME, ".*",
+            FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
+            FileListEntityProcessor.SMALLER_THAN, String.valueOf(minLength+1));
+    fList = getFiles(null, attrs);
+    l.clear();
+    l.add(new File(tmpdir, smallestFile).getAbsolutePath());
+    Assert.assertEquals(l, new HashSet<String>(fList));
+    attrs = AbstractDataImportHandlerTest.createMap(
+            FileListEntityProcessor.FILE_NAME, ".*",
+            FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
+            FileListEntityProcessor.SMALLER_THAN, "${a.x}");
+    VariableResolverImpl resolver = new VariableResolverImpl();
+    resolver.addNamespace("a", AbstractDataImportHandlerTest.createMap("x", "4"));
+    fList = getFiles(resolver, attrs);
+    Assert.assertEquals(l, new HashSet<String>(fList));
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<String> getFiles(VariableResolverImpl resolver, Map attrs) {
     Context c = AbstractDataImportHandlerTest.getContext(null,
-            new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
+            resolver, null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
     FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
     fileListEntityProcessor.init(c);
     List<String> fList = new ArrayList<String>();
@@ -88,23 +128,29 @@
         break;
       fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
     }
-    System.out.println("List of files when given OLDER_THAN -- " + fList);
+    return fList;
+  }
+
+  @Test
+  public void testNTOT() throws IOException {
+    long time = System.currentTimeMillis();
+    File tmpdir = new File("." + time);
+    tmpdir.mkdir();
+    tmpdir.deleteOnExit();
+    createFile(tmpdir, "a.xml", "a.xml".getBytes(), true);
+    createFile(tmpdir, "b.xml", "b.xml".getBytes(), true);
+    createFile(tmpdir, "c.props", "c.props".getBytes(), true);
+    Map attrs = AbstractDataImportHandlerTest.createMap(
+            FileListEntityProcessor.FILE_NAME, "xml$",
+            FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
+            FileListEntityProcessor.OLDER_THAN, "'NOW'");
+    List<String> fList = getFiles(null, attrs);
     Assert.assertEquals(2, fList.size());
     attrs = AbstractDataImportHandlerTest.createMap(
             FileListEntityProcessor.FILE_NAME, ".xml$",
             FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
             FileListEntityProcessor.NEWER_THAN, "'NOW-2HOURS'");
-    c = AbstractDataImportHandlerTest.getContext(null,
-            new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
-    fileListEntityProcessor.init(c);
-    fList.clear();
-    while (true) {
-      Map<String, Object> f = fileListEntityProcessor.nextRow();
-      if (f == null)
-        break;
-      fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
-    }
-    System.out.println("List of files when given NEWER_THAN -- " + fList);
+    fList = getFiles(null, attrs);
     Assert.assertEquals(2, fList.size());
   }
 
@@ -124,20 +170,7 @@
             FileListEntityProcessor.FILE_NAME, "^.*\\.xml$",
             FileListEntityProcessor.BASE_DIR, childdir.getAbsolutePath(),
             FileListEntityProcessor.RECURSIVE, "true");
-    Context c = AbstractDataImportHandlerTest.getContext(null,
-            new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
-    FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
-    fileListEntityProcessor.init(c);
-    List<String> fList = new ArrayList<String>();
-    while (true) {
-      // Add the documents to the index. NextRow() should only
-      // find two filesnames that match the pattern in fileName
-      Map<String, Object> f = fileListEntityProcessor.nextRow();
-      if (f == null)
-        break;
-      fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
-    }
-    System.out.println("List of files indexed -- " + fList);
+    List<String> fList = getFiles(null, attrs);
     Assert.assertEquals(2, fList.size());
   }
 
@@ -148,10 +181,8 @@
     FileOutputStream f = new FileOutputStream(file);
     f.write(content);
     f.close();
-    // System.out.println("before "+file.lastModified());
     if (changeModifiedTime)
       file.setLastModified(System.currentTimeMillis() - 3600000);
-    // System.out.println("after "+file.lastModified());
     return file;
   }
 }