You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Алексей Курган <ku...@hotjob.ru> on 2013/08/14 09:55:53 UTC
Solr with custom tokenizer

We have a problem with a custom tokenizer for Solr. We developed our own
tokenizer that extracts phone numbers from the text and adds extra tokens
for them to the token stream. Unfortunately, these additional tokens are
not indexed by Solr. For example, the text "Hello (111) 222-33-44 all!" is
expanded into the tokens: "2223344", "1112223344",
"71112223344", "81112223344", "hello", "111", "222", "33", "44", "all".
Searching for the tokens "2223344", "1112223344", "71112223344", "81112223344"
returns nothing. What could be the cause? We are using Solr 4.3.1.
The sources follow:

/**
 * Factory that builds {@link HJStandardTokenizer} instances.
 *
 * <p>The factory reads the whole input up front, hands the text to
 * {@code HJPhoneNumberHelper} to obtain the phone numbers it contains, and
 * then gives the tokenizer a fresh reader over the same text together with
 * those phone numbers.
 *
 * <p>Supported parameter: {@code maxTokenLength} (defaults to
 * {@code StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH}).
 */
public class HJStandardTokenizerFactory extends TokenizerFactory {
    private final int maxTokenLength;

    /**
     * Creates the factory from its configuration arguments.
     *
     * @param args configuration map; any entry still present after
     *             {@code maxTokenLength} has been read is reported as unknown
     * @throws IllegalArgumentException if {@code args} is not empty after the
     *                                  known parameters have been read
     */
    public HJStandardTokenizerFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        maxTokenLength = getInt(args, "maxTokenLength",
                StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
        if (!args.isEmpty()) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    /**
     * Builds a tokenizer for {@code input}.
     *
     * <p>NOTE(review): phone numbers are detected here, once, from the reader
     * passed at creation time. If the analysis chain reuses the returned
     * tokenizer for later documents (feeding them through
     * {@code setReader()}), those documents would not pass through this
     * method and would get no phone tokens. Confirm against the Lucene
     * version in use and consider moving the detection into the tokenizer.
     *
     * @param factory attribute factory forwarded to the tokenizer
     * @param input   reader over the text to tokenize; it is read to the end here
     * @return a tokenizer over a copy of the text, primed with the detected phone numbers
     * @throws RuntimeException if {@code input} cannot be read (the
     *                          {@link IOException} is kept as the cause)
     */
    @Override
    public Tokenizer create(AttributeSource.AttributeFactory factory, Reader input) {
        HJPhoneNumberHelper phoneNumberHelper;
        StringReader contentReader;
        try {
            String content = IOUtils.toString(input);
            phoneNumberHelper = new HJPhoneNumberHelper(content);
            // The original reader is exhausted now; the tokenizer scans a copy.
            contentReader = new StringReader(content);
        } catch (IOException e) {
            // Previously swallowed, which led to a NullPointerException below
            // with no hint of the real cause.
            throw new RuntimeException("Failed to read tokenizer input", e);
        }

        HJStandardTokenizer tokenizer = new HJStandardTokenizer(
                luceneMatchVersion, factory, contentReader,
                phoneNumberHelper.getPhoneNumbers());
        tokenizer.setMaxTokenLength(maxTokenLength);
        return tokenizer;
    }
}



public class HJStandardTokenizer extends Tokenizer{
    private StandardTokenizerInterface scanner;

    public static final int ALPHANUM          = 0;
    /** @deprecated (3.1) */
    @Deprecated
    public static final int APOSTROPHE        = 1;
    /** @deprecated (3.1) */
    @Deprecated
    public static final int ACRONYM           = 2;
    /** @deprecated (3.1) */
    @Deprecated
    public static final int COMPANY           = 3;
    public static final int EMAIL             = 4;
    /** @deprecated (3.1) */
    @Deprecated
    public static final int HOST              = 5;
    public static final int NUM               = 6;
    /** @deprecated (3.1) */
    @Deprecated
    public static final int CJ                = 7;

    /** @deprecated (3.1) */
    @Deprecated
    public static final int ACRONYM_DEP       = 8;

    public static final int SOUTHEAST_ASIAN = 9;
    public static final int IDEOGRAPHIC = 10;
    public static final int HIRAGANA = 11;
    public static final int KATAKANA = 12;
    public static final int HANGUL = 13;

    /** String token types that correspond to token type int constants */
    public static final String [] TOKEN_TYPES = new String [] {
            "<ALPHANUM>",
            "<APOSTROPHE>",
            "<ACRONYM>",
            "<COMPANY>",
            "<EMAIL>",
            "<HOST>",
            "<NUM>",
            "<CJ>",
            "<ACRONYM_DEP>",
            "<SOUTHEAST_ASIAN>",
            "<IDEOGRAPHIC>",
            "<HIRAGANA>",
            "<KATAKANA>",
            "<HANGUL>"
    };

    private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

    private static class PhoneTextPosition {
        public int position;
        public int length;
        public LinkedList<String> variants = new LinkedList<String>();

        private PhoneTextPosition(int position, int length,
Collection<String> phoneVariants) {
            this.position = position;
            this.length = length;
            this.variants.addAll(phoneVariants);
        }

        @Override
        public int hashCode() {
            return (new Integer(position).hashCode());
        }

        @Override
        public boolean equals(Object obj) {
            if (obj instanceof PhoneTextPosition) {
                PhoneTextPosition otherObj = (PhoneTextPosition)obj;
                if (position == otherObj.position &&
                        length == otherObj.length)
                    return true;
            }
            return false;
        }
    }

    private LinkedList<PhoneTextPosition> phoneVariants;

    /** Set the max allowed token length.  Any token longer
     *  than this is skipped. */
    public void setMaxTokenLength(int length) {
        this.maxTokenLength = length;
    }

    /** @see #setMaxTokenLength */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Creates a new instance of the {@link
org.apache.lucene.analysis.standard.StandardTokenizer}.  Attaches
     * the <code>input</code> to the newly created JFlex scanner.
     *
     * @param input The input reader
     *
     * See http://issues.apache.org/jira/browse/LUCENE-1068
     */
    public HJStandardTokenizer(Version matchVersion, Reader input) {
        super(input);
        init(matchVersion);
    }

    /**
     * Creates a new StandardTokenizer with a given {@link
org.apache.lucene.util.AttributeSource.AttributeFactory}
     */
    public HJStandardTokenizer(Version matchVersion, AttributeFactory
factory, Reader input, Collection<HJPhoneNumber> phones) {
        super(factory, input);
        init(matchVersion);

        phoneVariants = new LinkedList<PhoneTextPosition>();
        for (HJPhoneNumber phone : phones) {
            PhoneTextPosition position = new PhoneTextPosition(
                    phone.getPositionInText(),
                    phone.getLengthInText(),
                    phone.getAllVariants());
            phoneVariants.add(position);
        }
    }

    private final void init(Version matchVersion) {
        this.scanner = new StandardTokenizerImpl(null);
    }

    // this tokenizer generates three attributes:
    // term offset, positionIncrement and type
    private final CharTermAttribute termAtt =
addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt =
addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt =
addAttribute(PositionIncrementAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    /*
     * (non-Javadoc)
     *
     * @see org.apache.lucene.analysis.TokenStream#next()
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        if (phoneVariants.size() > 0) {
            PhoneTextPosition p = phoneVariants.peek();
            try {
                String variant = p.variants.poll();
                if (StringUtils.isNotEmpty(variant)) {
                    posIncrAtt.setPositionIncrement(1);
                    char[] buf = variant.toCharArray();
                    termAtt.resizeBuffer(buf.length);
                    termAtt.copyBuffer(buf, 0, buf.length);
                    final int start = p.position;
                    offsetAtt.setOffset(correctOffset(start),
correctOffset(start+p.length));

typeAtt.setType(HJStandardTokenizer.TOKEN_TYPES[HJStandardTokenizer.NUM]);
                    return true;
                }
            } finally {
                if (p.variants.size() == 0) {
                    phoneVariants.remove(p);
                }
            }
        }

        int posIncr = 1;

        while(true) {
            int tokenType = scanner.getNextToken();

            if (tokenType == StandardTokenizerInterface.YYEOF) {
                return false;
            }

            if (scanner.yylength() <= maxTokenLength) {
                posIncrAtt.setPositionIncrement(posIncr);
                scanner.getText(termAtt);
                final int start = scanner.yychar();
                offsetAtt.setOffset(correctOffset(start),
correctOffset(start+termAtt.length()));
                // This 'if' should be removed in the next release. For
now, it converts
                // invalid acronyms to HOST. When removed, only the 'else'
part should
                // remain.
                if (tokenType == HJStandardTokenizer.ACRONYM_DEP) {

typeAtt.setType(HJStandardTokenizer.TOKEN_TYPES[HJStandardTokenizer.HOST]);
                    termAtt.setLength(termAtt.length() - 1); // remove
extra '.'
                } else {

typeAtt.setType(HJStandardTokenizer.TOKEN_TYPES[tokenType]);
                }
                return true;
            } else
                // When we skip a too-long term, we still increment the
                // position increment
                posIncr++;
        }
    }

    @Override
    public final void end() {
        // set final offset
        int finalOffset = correctOffset(scanner.yychar() +
scanner.yylength());
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        scanner.yyreset(input);
    }
}