You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/02/05 19:24:38 UTC
svn commit: r907037 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/cf/taste/impl/model/
core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/
examples/src/main/java/org/apache/mahout/cf/taste/example/jester/
Author: srowen
Date: Fri Feb 5 18:24:37 2010
New Revision: 907037
URL: http://svn.apache.org/viewvc?rev=907037&view=rev
Log:
FileDataModel improvements: when only the update files, and not the main file, have changed, the main data is not reloaded
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java?rev=907037&r1=907036&r2=907037&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java Fri Feb 5 18:24:37 2010
@@ -124,6 +124,13 @@
return (FastByIDMap<FastIDSet>) (FastByIDMap<?>) data;
}
+ /**
+ * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
+ */
+ public FastByIDMap<FastIDSet> getRawUserData() {
+ return this.preferenceFromUsers;
+ }
+
@Override
public LongPrimitiveArrayIterator getUserIDs() {
return new LongPrimitiveArrayIterator(userIDs);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java?rev=907037&r1=907036&r2=907037&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java Fri Feb 5 18:24:37 2010
@@ -140,6 +140,13 @@
return data;
}
+ /**
+ * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
+ */
+ public FastByIDMap<PreferenceArray> getRawUserData() {
+ return this.preferenceFromUsers;
+ }
+
@Override
public LongPrimitiveArrayIterator getUserIDs() {
return new LongPrimitiveArrayIterator(userIDs);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=907037&r1=907036&r2=907037&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java Fri Feb 5 18:24:37 2010
@@ -21,6 +21,7 @@
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
import org.apache.mahout.common.FileLineIterator;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
@@ -77,8 +78,8 @@
* that, a JDBC-backed {@link DataModel} and a database are more appropriate.</p>
*
* <p>It is possible and likely useful to subclass this class and customize its behavior to accommodate
- * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap, char)} and
- * {@link #processLineWithoutID(String, FastByIDMap, char)}
+ * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap)} and
+ * {@link #processLineWithoutID(String, FastByIDMap)}
*/
public class FileDataModel implements DataModel {
@@ -89,6 +90,9 @@
private final File dataFile;
private long lastModified;
+ private long lastUpdateFileModified;
+ private final char delimiter;
+ private final boolean hasPrefValues;
private boolean loaded;
private DataModel delegate;
private final ReentrantLock reloadLock;
@@ -98,12 +102,17 @@
* @param dataFile file containing preferences data. If file is compressed (and name ends in .gz or .zip accordingly)
* it will be decompressed as it is read
* @throws FileNotFoundException if dataFile does not exist
+ * @throws IOException if file can't be read
*/
- public FileDataModel(File dataFile) throws FileNotFoundException {
+ public FileDataModel(File dataFile) throws IOException {
this(dataFile, false);
}
- public FileDataModel(File dataFile, boolean transpose) throws FileNotFoundException {
+ /**
+ * @param transpose transposes user IDs and item IDs -- convenient for 'flipping' the data model this way
+ * @see #FileDataModel(File)
+ */
+ public FileDataModel(File dataFile, boolean transpose) throws IOException {
if (dataFile == null) {
throw new IllegalArgumentException("dataFile is null");
}
@@ -115,6 +124,18 @@
this.dataFile = dataFile.getAbsoluteFile();
this.lastModified = dataFile.lastModified();
+ this.lastUpdateFileModified = readLastUpdateFileModified();
+
+ FileLineIterator iterator = new FileLineIterator(dataFile, false);
+ String firstLine = iterator.peek();
+ while (firstLine.length() == 0 || firstLine.charAt(0) == COMMENT_CHAR) {
+ iterator.next();
+ firstLine = iterator.peek();
+ }
+ iterator.close();
+ delimiter = determineDelimiter(firstLine, 2);
+ hasPrefValues = firstLine.indexOf(delimiter, firstLine.indexOf(delimiter) + 1) >= 0;
+
this.reloadLock = new ReentrantLock();
this.transpose = transpose;
}
@@ -123,6 +144,10 @@
return dataFile;
}
+ public char getDelimiter() {
+ return delimiter;
+ }
+
protected void reload() {
if (!reloadLock.isLocked()) {
reloadLock.lock();
@@ -138,29 +163,67 @@
}
protected DataModel buildModel() throws IOException {
- FileLineIterator iterator = new FileLineIterator(dataFile, false);
- String firstLine = iterator.peek();
- while (firstLine.length() == 0 || firstLine.charAt(0) == COMMENT_CHAR) {
- iterator.next();
- firstLine = iterator.peek();
- }
- char delimiter = determineDelimiter(firstLine, 2);
- boolean hasPrefValues = firstLine.indexOf(delimiter, firstLine.indexOf(delimiter) + 1) >= 0;
+
+ long newLastModified = dataFile.lastModified();
+ long newLastUpdateFileModified = readLastUpdateFileModified();
+
+ boolean loadFreshData = delegate == null || newLastModified > lastModified + MIN_RELOAD_INTERVAL_MS;
+
+ lastModified = newLastModified;
+ lastUpdateFileModified = newLastUpdateFileModified;
if (hasPrefValues) {
- FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>();
- processFile(iterator, data, delimiter);
- for (File updateFile : findUpdateFiles()) {
- processFile(new FileLineIterator(updateFile, false), data, delimiter);
+
+ if (loadFreshData) {
+
+ FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>();
+ FileLineIterator iterator = new FileLineIterator(dataFile, false);
+ processFile(iterator, data);
+
+ for (File updateFile : findUpdateFiles()) {
+ processFile(new FileLineIterator(updateFile, false), data);
+ }
+
+ return new GenericDataModel(GenericDataModel.toDataMap(data, true));
+
+ } else {
+
+ FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();
+
+ for (File updateFile : findUpdateFiles()) {
+ processFile(new FileLineIterator(updateFile, false), rawData);
+ }
+
+ return new GenericDataModel(rawData);
+
}
- return new GenericDataModel(GenericDataModel.toDataMap(data, true));
+
} else {
- FastByIDMap<FastIDSet> data = new FastByIDMap<FastIDSet>();
- processFileWithoutID(iterator, data, delimiter);
- for (File updateFile : findUpdateFiles()) {
- processFileWithoutID(new FileLineIterator(updateFile, false), data, delimiter);
+
+ if (loadFreshData) {
+
+ FastByIDMap<FastIDSet> data = new FastByIDMap<FastIDSet>();
+ FileLineIterator iterator = new FileLineIterator(dataFile, false);
+ processFileWithoutID(iterator, data);
+
+ for (File updateFile : findUpdateFiles()) {
+ processFileWithoutID(new FileLineIterator(updateFile, false), data);
+ }
+
+ return new GenericBooleanPrefDataModel(data);
+
+ } else {
+
+ FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData();
+
+ for (File updateFile : findUpdateFiles()) {
+ processFileWithoutID(new FileLineIterator(updateFile, false), rawData);
+ }
+
+ return new GenericBooleanPrefDataModel(rawData);
+
}
- return new GenericBooleanPrefDataModel(data);
+
}
}
@@ -185,6 +248,14 @@
return updateFiles;
}
+ private long readLastUpdateFileModified() {
+ long mostRecentModification = Long.MIN_VALUE;
+ for (File updateFile : findUpdateFiles()) {
+ mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified());
+ }
+ return mostRecentModification;
+ }
+
public static char determineDelimiter(String line, int maxDelimiters) {
char delimiter;
if (line.indexOf(',') >= 0) {
@@ -212,14 +283,13 @@
}
protected void processFile(FileLineIterator dataOrUpdateFileIterator,
- FastByIDMap<Collection<Preference>> data,
- char delimiter) {
+ FastByIDMap<?> data) {
log.info("Reading file info...");
AtomicInteger count = new AtomicInteger();
while (dataOrUpdateFileIterator.hasNext()) {
String line = dataOrUpdateFileIterator.next();
if (line.length() > 0) {
- processLine(line, data, delimiter);
+ processLine(line, data);
int currentCount = count.incrementAndGet();
if (currentCount % 1000000 == 0) {
log.info("Processed {} lines", currentCount);
@@ -240,7 +310,7 @@
* @param line line from input data file
* @param data all data read so far, as a mapping from user IDs to preferences
*/
- protected void processLine(String line, FastByIDMap<Collection<Preference>> data, char delimiter) {
+ protected void processLine(String line, FastByIDMap<?> data) {
if (line.length() == 0 || line.charAt(0) == COMMENT_CHAR) {
return;
@@ -274,37 +344,120 @@
userID = itemID;
itemID = tmp;
}
- Collection<Preference> prefs = data.get(userID);
- if (prefs == null) {
- prefs = new ArrayList<Preference>(2);
- data.put(userID, prefs);
- }
- if (preferenceValueString.length() == 0) {
- // remove pref
- Iterator<Preference> prefsIterator = prefs.iterator();
- while (prefsIterator.hasNext()) {
- Preference pref = prefsIterator.next();
- if (pref.getItemID() == itemID) {
- prefsIterator.remove();
- break;
+ // This is kind of gross, but we need to handle two types of storage: PreferenceArray and Collection<Preference>
+ Object maybePrefs = data.get(userID);
+ if (maybePrefs instanceof PreferenceArray) {
+
+ PreferenceArray prefs = (PreferenceArray) maybePrefs;
+ if (preferenceValueString.length() == 0) {
+ if (prefs != null) {
+ boolean exists = false;
+ int length = prefs.length();
+ for (int i = 0; i < length; i++) {
+ if (prefs.getItemID(i) == itemID) {
+ exists = true;
+ break;
+ }
+ }
+ if (exists) {
+ if (length == 1) {
+ data.remove(userID);
+ } else {
+ PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1);
+ for (int i = 0, j = 0; i < length; i++, j++) {
+ if (prefs.getItemID(i) == itemID) {
+ j--;
+ } else {
+ newPrefs.set(j, prefs.get(i));
+ }
+ }
+ }
+ }
+ }
+
+ } else {
+
+ float preferenceValue = Float.parseFloat(preferenceValueString);
+
+ boolean exists = false;
+ if (prefs != null) {
+ for (int i = 0; i < prefs.length(); i++) {
+ if (prefs.getItemID(i) == itemID) {
+ exists = true;
+ prefs.setValue(i, preferenceValue);
+ break;
+ }
+ }
+ }
+
+ if (!exists) {
+ if (prefs == null) {
+ prefs = new GenericUserPreferenceArray(1);
+ ((FastByIDMap<PreferenceArray>) data).put(userID, prefs);
+ } else {
+ PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1);
+ for (int i = 0, j = 1; i < prefs.length(); i++, j++) {
+ newPrefs.set(j, prefs.get(i));
+ }
+ }
+ prefs.setUserID(0, userID);
+ prefs.setItemID(0, itemID);
+ prefs.setValue(0, preferenceValue);
}
}
+
} else {
- float preferenceValue = Float.parseFloat(preferenceValueString);
- prefs.add(new GenericPreference(userID, itemID, preferenceValue));
+
+ Collection<Preference> prefs = (Collection<Preference>) maybePrefs;
+
+ if (preferenceValueString.length() == 0) {
+ if (prefs != null) {
+ // remove pref
+ Iterator<Preference> prefsIterator = prefs.iterator();
+ while (prefsIterator.hasNext()) {
+ Preference pref = prefsIterator.next();
+ if (pref.getItemID() == itemID) {
+ prefsIterator.remove();
+ break;
+ }
+ }
+ }
+ } else {
+
+ float preferenceValue = Float.parseFloat(preferenceValueString);
+
+ boolean exists = false;
+ if (prefs != null) {
+ for (Preference pref : prefs) {
+ if (pref.getItemID() == itemID) {
+ exists = true;
+ pref.setValue(preferenceValue);
+ break;
+ }
+ }
+ }
+
+ if (!exists) {
+ if (prefs == null) {
+ prefs = new ArrayList<Preference>(2);
+ ((FastByIDMap<Collection<Preference>>) data).put(userID, prefs);
+ }
+ prefs.add(new GenericPreference(userID, itemID, preferenceValue));
+ }
+ }
+
}
}
protected void processFileWithoutID(FileLineIterator dataOrUpdateFileIterator,
- FastByIDMap<FastIDSet> data,
- char delimiter) {
+ FastByIDMap<FastIDSet> data) {
log.info("Reading file info...");
AtomicInteger count = new AtomicInteger();
while (dataOrUpdateFileIterator.hasNext()) {
String line = dataOrUpdateFileIterator.next();
if (line.length() > 0) {
- processLineWithoutID(line, data, delimiter);
+ processLineWithoutID(line, data);
int currentCount = count.incrementAndGet();
if (currentCount % 100000 == 0) {
log.info("Processed {} lines", currentCount);
@@ -314,7 +467,7 @@
log.info("Read lines: {}", count.get());
}
- protected void processLineWithoutID(String line, FastByIDMap<FastIDSet> data, char delimiter) {
+ protected void processLineWithoutID(String line, FastByIDMap<FastIDSet> data) {
if (line.length() == 0 || line.charAt(0) == COMMENT_CHAR) {
return;
@@ -438,13 +591,9 @@
@Override
public void refresh(Collection<Refreshable> alreadyRefreshed) {
- long mostRecentModification = dataFile.lastModified();
- for (File updateFile : findUpdateFiles()) {
- mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified());
- }
- if (mostRecentModification > lastModified + MIN_RELOAD_INTERVAL_MS) {
+ if (dataFile.lastModified() > lastModified + MIN_RELOAD_INTERVAL_MS ||
+ readLastUpdateFileModified() > lastUpdateFileModified + MIN_RELOAD_INTERVAL_MS) {
log.debug("File has changed; reloading...");
- lastModified = mostRecentModification;
reload();
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java?rev=907037&r1=907036&r2=907037&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java Fri Feb 5 18:24:37 2010
@@ -57,13 +57,14 @@
protected DataModel buildModel() throws IOException {
FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>();
FileLineIterator iterator = new FileLineIterator(getDataFile(), false);
- processFile(iterator, data, ',');
+ processFile(iterator, data);
return new GenericDataModel(GenericDataModel.toDataMap(data, true));
}
@Override
- protected void processLine(String line, FastByIDMap<Collection<Preference>> data, char delimiter) {
- String[] jokePrefs = line.split(String.valueOf(delimiter));
+ protected void processLine(String line, FastByIDMap<?> rawData) {
+ FastByIDMap<Collection<Preference>> data = (FastByIDMap<Collection<Preference>>) rawData;
+ String[] jokePrefs = line.split(",");
int count = Integer.parseInt(jokePrefs[0]);
Collection<Preference> prefs = new ArrayList<Preference>(count);
for (int itemID = 1; itemID < jokePrefs.length; itemID++) { // yes skip first one, just a count