You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2009/08/17 18:52:32 UTC

svn commit: r805035 - in /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste: impl/common/ impl/model/ model/

Author: srowen
Date: Mon Aug 17 16:52:31 2009
New Revision: 805035

URL: http://svn.apache.org/viewvc?rev=805035&view=rev
Log:
MAHOUT-162

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RandomUtils.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RandomUtils.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RandomUtils.java?rev=805035&r1=805034&r2=805035&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RandomUtils.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/RandomUtils.java Mon Aug 17 16:52:31 2009
@@ -60,20 +60,6 @@
   }
 
   /**
-   * See String.hashCode() from Apache Harmony
-   */
-  public static long hashStringToLong(String value) {
-    long hash = 0L;
-    int multiplier = 1;
-    for (int i = value.length() - 1; i >= 0; i--) {
-      hash += value.charAt(i) * multiplier;
-      int shifted = multiplier << 5;
-      multiplier = shifted - multiplier;
-    }
-    return hash;
-  }
-
-  /**
    * <p>Finds next-largest "twin primes": numbers p and p+2 such that both are prime. Finds the smallest such p such
    * that the smaller twin, p, is greater than or equal to n. Returns p+2, the larger of the two twins.</p>
    */

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java?rev=805035&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java Mon Aug 17 16:52:31 2009
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.IDMigrator;
+
+import java.nio.charset.Charset;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+/** Base {@link IDMigrator}: computes long IDs from an MD5 digest; subclasses store the reverse mapping. */
+public abstract class AbstractIDMigrator implements IDMigrator {
+  private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
+  private final MessageDigest md5Digest;
+
+  protected AbstractIDMigrator() {
+    try {
+      md5Digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException nsae) {
+      // Can't happen
+      throw new IllegalStateException(nsae);
+    }
+  }
+
+  /** @return the first 8 bytes of the MD5 digest of {@code value}'s UTF-8 bytes, most significant byte first */
+  protected final long hash(String value) {
+    byte[] md5hash;
+    synchronized (md5Digest) { // the single digest instance is shared by all callers
+      md5hash = md5Digest.digest(value.getBytes(UTF8_CHARSET)); // digest() also resets the instance
+    }
+    long hash = 0L;
+    for (int i = 0; i < 8; i++) {
+      hash = (hash << 8) | (md5hash[i] & 0xFFL);
+    }
+    return hash;
+  }
+
+  /** Hashes {@code stringID}, hands the pair to {@link #storeMapping(long, String)} and returns the hash. */
+  @Override
+  public long toLongID(String stringID) throws TasteException {
+    long longID = hash(stringID);
+    storeMapping(longID, stringID);
+    return longID;
+  }
+
+  /** Calls {@link #toLongID(String)} on each given ID, which stores its reverse mapping. */
+  @Override
+  public void initialize(Iterable<String> stringIDs) throws TasteException {
+    for (String stringID : stringIDs) {
+      toLongID(stringID);
+    }
+  }
+
+  /** Stores the reverse mapping from {@code longID} to {@code stringID}, for lookup by {@link #toStringID(long)}. */
+  protected abstract void storeMapping(long longID, String stringID) throws TasteException;
+  @Override
+  public abstract String toStringID(long longID) throws TasteException;
+}
\ No newline at end of file

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java?rev=805035&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java Mon Aug 17 16:52:31 2009
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.IOUtils;
+
+import javax.sql.DataSource;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/** JDBC-backed {@link AbstractIDMigrator}; the two SQL statements it executes are supplied to the constructor. */
+public abstract class AbstractJDBCIDMigrator extends AbstractIDMigrator {
+
+  public static final String DEFAULT_MAPPING_TABLE = "taste_id_mapping";
+  public static final String DEFAULT_LONG_ID_COLUMN = "long_id";
+  public static final String DEFAULT_STRING_ID_COLUMN = "string_id";
+
+  private final DataSource dataSource;
+  private final String getStringIDSQL;
+  private final String storeMappingSQL;
+
+  /**
+   * @param getStringIDSQL query taking the long ID as parameter 1 and yielding the string ID in column 1
+   * @param storeMappingSQL update taking the long ID as parameter 1 and the string ID as parameter 2
+   */
+  public AbstractJDBCIDMigrator(DataSource dataSource, String getStringIDSQL, String storeMappingSQL) {
+    this.dataSource = dataSource;
+    this.getStringIDSQL = getStringIDSQL;
+    this.storeMappingSQL = storeMappingSQL;
+  }
+
+  /** Executes the store statement with (longID, stringID); a {@link SQLException} is rethrown as {@link TasteException}. */
+  @Override
+  protected final void storeMapping(long longID, String stringID) throws TasteException {
+    Connection connection = null;
+    PreparedStatement statement = null;
+    try {
+      connection = dataSource.getConnection();
+      statement = connection.prepareStatement(storeMappingSQL);
+      statement.setLong(1, longID);
+      statement.setString(2, stringID);
+      statement.executeUpdate();
+    } catch (SQLException e) {
+      throw new TasteException(e);
+    } finally {
+      IOUtils.quietClose(null, statement, connection);
+    }
+  }
+
+  /** @return column 1 of the first row the lookup query yields for {@code longID}, or null if it yields no row */
+  @Override
+  public final String toStringID(long longID) throws TasteException {
+    Connection connection = null;
+    PreparedStatement statement = null;
+    ResultSet results = null;
+    try {
+      connection = dataSource.getConnection();
+      statement = connection.prepareStatement(getStringIDSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+      statement.setFetchDirection(ResultSet.FETCH_FORWARD);
+      statement.setFetchSize(1);
+      statement.setLong(1, longID);
+      results = statement.executeQuery();
+      return results.next() ? results.getString(1) : null;
+    } catch (SQLException e) {
+      throw new TasteException(e);
+    } finally {
+      IOUtils.quietClose(results, statement, connection);
+    }
+  }
+}
\ No newline at end of file

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java?rev=805035&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java Mon Aug 17 16:52:31 2009
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+
+/** {@link AbstractIDMigrator} that keeps the long-to-string mapping in an in-memory {@link FastByIDMap}. */
+public final class MemoryIDMigrator extends AbstractIDMigrator {
+  private final FastByIDMap<String> stringIDsByLongID = new FastByIDMap<String>(100);
+
+  public MemoryIDMigrator() {
+  }
+
+  /** Puts {@code stringID} into the map under {@code longID}, holding the map's monitor while doing so. */
+  @Override
+  protected void storeMapping(long longID, String stringID) {
+    synchronized (stringIDsByLongID) {
+      stringIDsByLongID.put(longID, stringID);
+    }
+  }
+
+  /** @return the map's entry for {@code longID}, read while holding the map's monitor */
+  @Override
+  public String toStringID(long longID) {
+    synchronized (stringIDsByLongID) {
+      return stringIDsByLongID.get(longID);
+    }
+  }
+}
\ No newline at end of file

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java?rev=805035&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java Mon Aug 17 16:52:31 2009
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import javax.sql.DataSource;
+
+/**
+ * <p>An implementation for MySQL. The following statement would create a table suitable for use with
+ * this class when the default table and column names are used:</p>
+ *
+ * <pre>
+ * CREATE TABLE taste_id_mapping (
+ *   long_id BIGINT NOT NULL,
+ *   string_id VARCHAR(255) NOT NULL,
+ *   PRIMARY KEY (long_id)
+ * )
+ * </pre>
+ */
+public final class MySQLJDBCIDMigrator extends AbstractJDBCIDMigrator {
+  /** Uses the default table and column names declared in {@link AbstractJDBCIDMigrator}. */
+  public MySQLJDBCIDMigrator(DataSource dataSource) {
+    this(dataSource, DEFAULT_MAPPING_TABLE, DEFAULT_LONG_ID_COLUMN, DEFAULT_STRING_ID_COLUMN);
+  }
+
+  /** The table and column names are concatenated into the SQL verbatim, so they must come from trusted configuration. */
+  public MySQLJDBCIDMigrator(DataSource dataSource,
+                             String mappingTable,
+                             String longIDColumn,
+                             String stringIDColumn) {
+    super(dataSource,
+          "SELECT " + stringIDColumn + " FROM " + mappingTable + " WHERE " + longIDColumn + "=?",
+          "REPLACE INTO " + mappingTable + " (" + longIDColumn + ',' + stringIDColumn + ") VALUES (?,?)");
+  }
+}
\ No newline at end of file

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java?rev=805035&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java Mon Aug 17 16:52:31 2009
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>Mahout 0.2 changed the framework to operate only in terms of numeric (long) ID values
+ * for users and items. This is, obviously, not compatible with applications that used other
+ * key types -- most commonly {@link String}. Implementations of this interface provide support for
+ * mapping Strings to longs and vice versa in order to provide a smoother migration path to
+ * applications that must still use strings as IDs.</p>
+ *
+ * <p>The mapping from strings to 64-bit numeric values is fixed here, to provide a standard
+ * implementation that is 'portable' or easily reproducible outside the framework. See
+ * {@link #toLongID(String)}.</p>
+ *
+ * <p>Because this mapping is deterministically computable, it does not need to be stored. Indeed,
+ * the implementations' job is to store the reverse mapping. There are infinitely many strings but only
+ * a fixed number of longs, so it is possible for two strings to map to the same value. Implementations
+ * do not treat this as an error but rather retain only the most recent mapping, overwriting a previous
+ * mapping. The probability of collision in a 64-bit space is quite small, but not zero. However,
+ * in the context of a collaborative filtering problem, the consequence of a collision is small, at worst
+ * -- perhaps one user receives another user's recommendations.</p>
+ *
+ * @since 0.2
+ */
+public interface IDMigrator {
+
+  /**
+   * @return the top 8 bytes of the MD5 hash of the bytes of the given {@link String}'s UTF-8 encoding as a long.
+   *  The reverse mapping is also stored.
+   * @throws TasteException if an error occurs while storing the mapping
+   */
+  long toLongID(String stringID) throws TasteException;
+
+  /**
+   * @return the string ID most recently associated with the given long ID, or null if it doesn't exist
+   * @throws TasteException if an error occurs while retrieving the mapping
+   */
+  String toStringID(long longID) throws TasteException;
+
+  /**
+   * Makes the mapping aware of the given string IDs.
+   *
+   * @throws TasteException if an error occurs while storing the mappings
+   */
+  void initialize(Iterable<String> stringIDs) throws TasteException;
+
+}