[Couchdb Wiki] Update of "FullTextIndexWithView" by DanielReverri

http://wiki.apache.org/couchdb/FullTextIndexWithView
I wanted to throw this idea out there to see what people thought.

There has been a lot of discussion about integrating full text search into CouchDB, possibly implemented in Erlang. Would it be worth investigating the use of CouchDB's !MapReduce functionality to implement a full text indexer? I whipped together a short example using views. It implements a simple whitespace tokenizer in !JavaScript, emits each token with its doc id and position, and reduces each token to a list of doc ids and positions.

Here is the map function:

{{{
function(doc)
{
    // Emit each token under a [value, field] key; "this" is the doc
    var tokenEmit = function(token) {
        emit([token.value, token.field], [this._id, token.position]);
    };

    var whiteSpaceAnalyzer = function(str, field) {
        // Returns tokens split by white space
        // token: { value: tokenString, field: fieldName, position: [0,10] }
        var len = str.length;
        var tokenPositions = [];
        var startPosition = null;

        var isTokenChar = function(c) {
            return !(c === ' ' || c === '\t' || c === '\n');
        };

        for (var i = 0; i < len; i++)
        {
            if (startPosition == null)
            {
                if (isTokenChar(str.charAt(i)))
                {
                    // start of word
                    startPosition = i;
                    if (i + 1 == len)
                    {
                        // end of string
                        tokenPositions[tokenPositions.length] = [startPosition, i + 1];
                    }
                }
            }
            else
            {
                if (!isTokenChar(str.charAt(i)))
                {
                    // end of word
                    tokenPositions[tokenPositions.length] = [startPosition, i];
                    startPosition = null; // reset startPosition
                    continue;
                }

                if (i + 1 == len)
                {
                    // end of string
                    tokenPositions[tokenPositions.length] = [startPosition, i + 1];
                }
            }
        }

        // Turn each [start, end] pair into a token object
        var tokenMap = function(tokenPosition) {
            var token = this.str.substring(tokenPosition[0], tokenPosition[1]);
            return { value: token, field: this.field, position: tokenPosition };
        };

        return tokenPositions.map(tokenMap, { str: str, field: field });
    };

    for (var field in doc) {
        // skip _id, _rev and other special fields
        if (field.charAt(0) !== '_' && typeof(doc[field]) == 'string') {
            whiteSpaceAnalyzer(doc[field], field).map(tokenEmit, doc);
        }
    }
}
}}}
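
To make the output concrete, here is what this map function would emit for a small made-up document (the doc id and field name are only for illustration):

{{{
Given this hypothetical document:

  { "_id": "doc1", "title": "hello world" }

the map function emits one row per token (key -> value):

  ["hello", "title"]  ->  ["doc1", [0, 5]]
  ["world", "title"]  ->  ["doc1", [6, 11]]

Each position is a [start, end] offset pair into the original field string.
}}}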

Here is the reduce function:

{{{
function(keys, values, combine)
{
    var result = [];
    var docHash = {}; // docId -> accumulated list of positions

    if (combine)
    {
        // values is a list of previous reduce results, each of which
        // is itself a list of {doc, pos} objects
        for (var v = 0; v < values.length; v++)
        {
            for (var d = 0; d < values[v].length; d++)
            {
                var docObject = values[v][d];
                var docId = docObject["doc"];
                var positions = docObject["pos"];
                if (docHash[docId] == null)
                {
                    docHash[docId] = [];
                }
                docHash[docId] = docHash[docId].concat(positions);
            }
        }
    }
    else
    {
        // values is a list of [docId, position] pairs from the map step
        for (var j = 0; j < values.length; j++)
        {
            var docId = values[j][0];
            var position = values[j][1];
            if (docHash[docId] == null)
            {
                docHash[docId] = [];
            }
            docHash[docId] = docHash[docId].concat([position]);
        }
    }

    for (var i in docHash)
    {
        result[result.length] = { doc: i, pos: docHash[i] };
    }
    return result;
}
}}}
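
For the same hypothetical documents, the reduced value for a key such as {{{["hello","title"]}}} would look something like the following (doc2 and its offsets are made up to show multiple matching documents):

{{{
[ { "doc": "doc1", "pos": [[0, 5]] },
  { "doc": "doc2", "pos": [[12, 17], [40, 45]] } ]
}}}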

The key emitted from the view is {{{["token","field"]}}}. This allows terms to be searched per field while also allowing the use of group_level=1 to combine the results of all fields. Combining the results of multiple fields currently makes the positions unusable, since the offsets can no longer be tied to a particular field.
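
As a sketch of how this would be queried (the database name {{{db}}}, design document {{{fti}}}, and view name {{{index}}} are placeholders, and the key parameters would need to be URL-encoded in practice):

{{{
# all occurrences of "apple" in the title field:
GET /db/_view/fti/index?key=["apple","title"]

# all occurrences of "apple" in any field, grouped by token:
GET /db/_view/fti/index?group_level=1&startkey=["apple"]&endkey=["apple",{}]
}}}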

To reduce the amount of information passed to the view server during view generation, the whiteSpaceAnalyzer function could be moved into the main.js file.
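
Assuming whiteSpaceAnalyzer (and tokenEmit) were defined globally in main.js, the map function itself would shrink to something like this untested sketch:

{{{
function(doc) {
    for (var field in doc) {
        // skip _id, _rev and other special fields
        if (field.charAt(0) !== '_' && typeof(doc[field]) == 'string') {
            whiteSpaceAnalyzer(doc[field], field).map(tokenEmit, doc);
        }
    }
}
}}}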

Is this worth pursuing further?