You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2009/09/07 15:12:01 UTC
svn commit: r812122 - in /lucene/solr/trunk/contrib/dataimporthandler: ./
src/main/java/org/apache/solr/handler/dataimport/
src/test/java/org/apache/solr/handler/dataimport/
Author: shalin
Date: Mon Sep 7 13:12:01 2009
New Revision: 812122
URL: http://svn.apache.org/viewvc?rev=812122&view=rev
Log:
SOLR-1406 -- Make FileDataSource and FileListEntityProcessor to be more extensible, fix variable resolving bug in FileListEntityProcessor and add tests
Modified:
lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java
lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java
lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java
Modified: lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt Mon Sep 7 13:12:01 2009
@@ -155,6 +155,8 @@
37.SOLR-1348: Support binary field type in convertType logic in JdbcDataSource (shalin)
+38.SOLR-1406: Make FileDataSource and FileListEntityProcessor to be more extensible (Luke Forehand, shalin)
+
Optimizations
----------------------
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used
Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileDataSource.java Mon Sep 7 13:12:01 2009
@@ -44,9 +44,15 @@
public class FileDataSource extends DataSource<Reader> {
public static final String BASE_PATH = "basePath";
- private String basePath;
+ /**
+ * The basePath for this data source
+ */
+ protected String basePath;
- private String encoding = null;
+ /**
+ * The encoding with which the given file should be read
+ */
+ protected String encoding = null;
private static final Logger LOG = LoggerFactory.getLogger(FileDataSource.class);
@@ -95,7 +101,16 @@
}
}
- private InputStreamReader openStream(File file) throws FileNotFoundException,
+ /**
+ * Open a {@link java.io.Reader} for the given file
+ *
+ * @param file a {@link java.io.File} instance
+ * @return a Reader on the given file
+ * @throws FileNotFoundException if the File does not exist
+ * @throws UnsupportedEncodingException if the encoding is unsupported
+ * @since solr 1.4
+ */
+ protected Reader openStream(File file) throws FileNotFoundException,
UnsupportedEncodingException {
if (encoding == null) {
return new InputStreamReader(new FileInputStream(file));
Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FileListEntityProcessor.java Mon Sep 7 13:12:01 2009
@@ -54,13 +54,54 @@
* @since solr 1.3
*/
public class FileListEntityProcessor extends EntityProcessorBase {
- private String fileName, baseDir, excludes;
-
- private Date newerThan, olderThan;
-
- private long biggerThan = -1, smallerThan = -1;
-
- private boolean recursive = false;
+ /**
+ * A regex pattern to identify files given in data-config.xml after resolving any variables
+ */
+ protected String fileName;
+
+ /**
+ * The baseDir given in data-config.xml after resolving any variables
+ */
+ protected String baseDir;
+
+ /**
+ * A regex pattern of excluded file names as given in data-config.xml after resolving any variables
+ */
+ protected String excludes;
+
+ /**
+ * The newerThan given in data-config as a {@link java.util.Date}
+ * <p>
+ * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
+ * </p>
+ */
+ protected Date newerThan;
+
+ /**
+ * The olderThan given in data-config as a {@link java.util.Date}
+ */
+ protected Date olderThan;
+
+ /**
+ * The biggerThan given in data-config as a long value
+ * <p>
+ * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
+ * </p>
+ */
+ protected long biggerThan = -1;
+
+ /**
+ * The smallerThan given in data-config as a long value
+ * <p>
+ * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
+ * </p>
+ */
+ protected long smallerThan = -1;
+
+ /**
+ * The recursive given in data-config. Default value is false.
+ */
+ protected boolean recursive = false;
private Pattern fileNamePattern, excludesPattern;
@@ -91,13 +132,23 @@
}
}
+ /**
+ * Get the Date object corresponding to the given string.
+ *
+ * @param dateStr the date string. It can be a DateMath string or it may have an evaluator function
+ * @return a Date instance corresponding to the input string
+ */
private Date getDate(String dateStr) {
if (dateStr == null)
return null;
Matcher m = PLACE_HOLDER_PATTERN.matcher(dateStr);
if (m.find()) {
- return (Date) resolver.resolve(dateStr);
+ Object o = resolver.resolve(m.group(1));
+ if (o instanceof Date) return (Date)o;
+ dateStr = (String) o;
+ } else {
+ dateStr = resolver.replaceTokens(dateStr);
}
m = EvaluatorBag.IN_SINGLE_QUOTES.matcher(dateStr);
if (m.find()) {
@@ -118,9 +169,34 @@
}
}
+ /**
+ * Get the Long value for the given string after resolving any evaluator or variable.
+ *
+ * @param sizeStr the size as a string
+ * @return the Long value corresponding to the given string
+ */
+ private Long getSize(String sizeStr) {
+ if (sizeStr == null)
+ return null;
+
+ Matcher m = PLACE_HOLDER_PATTERN.matcher(sizeStr);
+ if (m.find()) {
+ Object o = resolver.resolve(m.group(1));
+ if (o instanceof Number) {
+ Number number = (Number) o;
+ return number.longValue();
+ }
+ sizeStr = (String) o;
+ } else {
+ sizeStr = resolver.replaceTokens(sizeStr);
+ }
+
+ return Long.parseLong(sizeStr);
+ }
+
public Map<String, Object> nextRow() {
if (rowIterator != null)
- return getAndApplyTrans();
+ return getNext();
List<Map<String, Object>> fileDetails = new ArrayList<Map<String, Object>>();
File dir = new File(baseDir);
@@ -128,17 +204,16 @@
newerThan = getDate(dateStr);
dateStr = context.getEntityAttribute(OLDER_THAN);
olderThan = getDate(dateStr);
+ String biggerThanStr = context.getEntityAttribute(BIGGER_THAN);
+ if (biggerThanStr != null)
+ biggerThan = getSize(biggerThanStr);
+ String smallerThanStr = context.getEntityAttribute(SMALLER_THAN);
+ if (smallerThanStr != null)
+ smallerThan = getSize(smallerThanStr);
getFolderFiles(dir, fileDetails);
rowIterator = fileDetails.iterator();
- return getAndApplyTrans();
- }
-
- private Map<String, Object> getAndApplyTrans() {
- while (true) {
- Map<String, Object> r = getNext();
- return r;
- }
+ return getNext();
}
private void getFolderFiles(File dir, final List<Map<String, Object>> fileDetails) {
@@ -186,7 +261,7 @@
}
public static final Pattern PLACE_HOLDER_PATTERN = Pattern
- .compile("\\$\\{.*?\\}");
+ .compile("\\$\\{(.*?)\\}");
public static final String DIR = "fileDir";
Modified: lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java?rev=812122&r1=812121&r2=812122&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFileListEntityProcessor.java Mon Sep 7 13:12:01 2009
@@ -22,10 +22,7 @@
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
/**
* <p>
@@ -63,22 +60,65 @@
}
Assert.assertEquals(2, fList.size());
}
-
+
@Test
- public void testNTOT() throws IOException {
+ public void testBiggerSmallerFiles() throws IOException {
long time = System.currentTimeMillis();
File tmpdir = new File("." + time);
tmpdir.mkdir();
tmpdir.deleteOnExit();
- createFile(tmpdir, "a.xml", "a.xml".getBytes(), true);
- createFile(tmpdir, "b.xml", "b.xml".getBytes(), true);
- createFile(tmpdir, "c.props", "c.props".getBytes(), true);
+ long minLength = Long.MAX_VALUE;
+ String smallestFile = "";
+ byte[] content = "abcdefgij".getBytes("UTF-8");
+ createFile(tmpdir, "a.xml", content, false);
+ if (minLength > content.length) {
+ minLength = content.length;
+ smallestFile = "a.xml";
+ }
+ content = "abcdefgij".getBytes("UTF-8");
+ createFile(tmpdir, "b.xml", content, false);
+ if (minLength > content.length) {
+ minLength = content.length;
+ smallestFile = "b.xml";
+ }
+ content = "abc".getBytes("UTF-8");
+ createFile(tmpdir, "c.props", content, false);
+ if (minLength > content.length) {
+ minLength = content.length;
+ smallestFile = "c.props";
+ }
Map attrs = AbstractDataImportHandlerTest.createMap(
- FileListEntityProcessor.FILE_NAME, "xml$",
+ FileListEntityProcessor.FILE_NAME, ".*",
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
- FileListEntityProcessor.OLDER_THAN, "'NOW'");
+ FileListEntityProcessor.BIGGER_THAN, String.valueOf(minLength));
+ List<String> fList = getFiles(null, attrs);
+ Assert.assertEquals(2, fList.size());
+ Set<String> l = new HashSet<String>();
+ l.add(new File(tmpdir, "a.xml").getAbsolutePath());
+ l.add(new File(tmpdir, "b.xml").getAbsolutePath());
+ Assert.assertEquals(l, new HashSet<String>(fList));
+ attrs = AbstractDataImportHandlerTest.createMap(
+ FileListEntityProcessor.FILE_NAME, ".*",
+ FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
+ FileListEntityProcessor.SMALLER_THAN, String.valueOf(minLength+1));
+ fList = getFiles(null, attrs);
+ l.clear();
+ l.add(new File(tmpdir, smallestFile).getAbsolutePath());
+ Assert.assertEquals(l, new HashSet<String>(fList));
+ attrs = AbstractDataImportHandlerTest.createMap(
+ FileListEntityProcessor.FILE_NAME, ".*",
+ FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
+ FileListEntityProcessor.SMALLER_THAN, "${a.x}");
+ VariableResolverImpl resolver = new VariableResolverImpl();
+ resolver.addNamespace("a", AbstractDataImportHandlerTest.createMap("x", "4"));
+ fList = getFiles(resolver, attrs);
+ Assert.assertEquals(l, new HashSet<String>(fList));
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<String> getFiles(VariableResolverImpl resolver, Map attrs) {
Context c = AbstractDataImportHandlerTest.getContext(null,
- new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
+ resolver, null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
fileListEntityProcessor.init(c);
List<String> fList = new ArrayList<String>();
@@ -88,23 +128,29 @@
break;
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
}
- System.out.println("List of files when given OLDER_THAN -- " + fList);
+ return fList;
+ }
+
+ @Test
+ public void testNTOT() throws IOException {
+ long time = System.currentTimeMillis();
+ File tmpdir = new File("." + time);
+ tmpdir.mkdir();
+ tmpdir.deleteOnExit();
+ createFile(tmpdir, "a.xml", "a.xml".getBytes(), true);
+ createFile(tmpdir, "b.xml", "b.xml".getBytes(), true);
+ createFile(tmpdir, "c.props", "c.props".getBytes(), true);
+ Map attrs = AbstractDataImportHandlerTest.createMap(
+ FileListEntityProcessor.FILE_NAME, "xml$",
+ FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
+ FileListEntityProcessor.OLDER_THAN, "'NOW'");
+ List<String> fList = getFiles(null, attrs);
Assert.assertEquals(2, fList.size());
attrs = AbstractDataImportHandlerTest.createMap(
FileListEntityProcessor.FILE_NAME, ".xml$",
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
FileListEntityProcessor.NEWER_THAN, "'NOW-2HOURS'");
- c = AbstractDataImportHandlerTest.getContext(null,
- new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
- fileListEntityProcessor.init(c);
- fList.clear();
- while (true) {
- Map<String, Object> f = fileListEntityProcessor.nextRow();
- if (f == null)
- break;
- fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
- }
- System.out.println("List of files when given NEWER_THAN -- " + fList);
+ fList = getFiles(null, attrs);
Assert.assertEquals(2, fList.size());
}
@@ -124,20 +170,7 @@
FileListEntityProcessor.FILE_NAME, "^.*\\.xml$",
FileListEntityProcessor.BASE_DIR, childdir.getAbsolutePath(),
FileListEntityProcessor.RECURSIVE, "true");
- Context c = AbstractDataImportHandlerTest.getContext(null,
- new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
- FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
- fileListEntityProcessor.init(c);
- List<String> fList = new ArrayList<String>();
- while (true) {
- // Add the documents to the index. NextRow() should only
- // find two filesnames that match the pattern in fileName
- Map<String, Object> f = fileListEntityProcessor.nextRow();
- if (f == null)
- break;
- fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
- }
- System.out.println("List of files indexed -- " + fList);
+ List<String> fList = getFiles(null, attrs);
Assert.assertEquals(2, fList.size());
}
@@ -148,10 +181,8 @@
FileOutputStream f = new FileOutputStream(file);
f.write(content);
f.close();
- // System.out.println("before "+file.lastModified());
if (changeModifiedTime)
file.setLastModified(System.currentTimeMillis() - 3600000);
- // System.out.println("after "+file.lastModified());
return file;
}
}