Lucene分词器
Lucene分析器的基类为Analyzer,Analyzer包含两个核心组件:Tokenizer和 TokenFilter。自定义分析器必须实现Analyzer类的抽象方法createComponents(String)来定义TokenStreamComponents。在调用方法tokenStream(String, Reader)的时候,TokenStreamComponents会被重复使用。
自定义分析器首先需要继承Analyzer类,代码如下:
/**
 * Custom analyzer that tokenizes on whitespace/punctuation via {@link LetterTokenizer}
 * and optionally removes stop words via {@link HStopTokenFilter}.
 */
public class HAnalyzer extends Analyzer {

    /** True when a stop-word set was supplied; the default constructor disables filtering. */
    private boolean useStopWords;

    /** Stop words to drop from the token stream; only read when {@code useStopWords} is true. */
    private CharArraySet stopWords;

    /** Creates an analyzer that performs no stop-word filtering. */
    public HAnalyzer() {
        useStopWords = false;
    }

    /**
     * Creates an analyzer that filters out the given stop words.
     *
     * @param stopWords the set of words to remove from the stream
     */
    public HAnalyzer(CharArraySet stopWords) {
        useStopWords = true;
        this.stopWords = stopWords;
    }

    /**
     * Builds the tokenizer/filter chain. Called once per field; the resulting
     * components are cached and reused by {@link Analyzer#tokenStream}.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        LetterTokenizer source = new LetterTokenizer();
        if (!useStopWords) {
            return new TokenStreamComponents(source);
        }
        return new TokenStreamComponents(source, new HStopTokenFilter(source, stopWords));
    }
}
Analyzer的两个核心组件:Tokenizer和TokenFilter,实现如下:
/* * 分词解析器,需要定义Token属性CharTermAttribute offsetAttribute * */ public class LetterTokenizer extends Tokenizer { /* * 词元文本属性 * */ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /* * 词元位移属性 * */ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /* * Token文本最大长度 * */ private static final int MAX_WORD_LEN = 255; /* * Buffer Size * */ private static final int IO_BUFFER_SIZE = 4096; private char[] ioBuffer = new char[IO_BUFFER_SIZE]; /* * Token分隔符集合 * */ private char[] splitChars = {‘ ‘,‘,‘,‘.‘,‘!‘}; /* * 当前字符串在原字符串中的位置 * */ private int offset = 0; /* * 当前字符在这一次读取的字符串中的位置 * */ private int bufferIndex = 0; /* * 每次读取字符串的长度 * */ private int dataLen = 0; @Override public boolean incrementToken() throws IOException { clearAttributes(); // 清除前一个Token的所有属性 int length = 0; // 单词的长度 int start = bufferIndex; char []buffer = termAtt.buffer(); while(true) { if(bufferIndex >= dataLen) { // 分词处理到ioBuffer末尾时,继续从input读取数据 offset += dataLen; dataLen = input.read(ioBuffer); if(dataLen == -1) { // 在Reader读取结束 dataLen = 0; if(length > 0) { // 虽然从input读取完数据,ioBuffer处理的字符 还没有生成Token break; } else { return false; } } bufferIndex = 0; // 指向ioBuffer的起始位置 } /**处理ioBuffer读取的字符*/ final char ch = ioBuffer[bufferIndex++]; if(isTokenChar(ch)) { // ch分隔符,形成Token,跳出循环 if(length == 0) { start = offset + bufferIndex - 1; } else if(length == buffer.length) { buffer = termAtt.resizeBuffer(length + 1); } if(length == MAX_WORD_LEN) { break; } break; } else { buffer[length++] = normalize(ch); // CharTermAttribute文本赋值 } } termAtt.setLength(length); offsetAtt.setOffset(correctOffset(start), correctOffset(start + length)); return true; } /* * 规整化--->转为小写 * */ protected char normalize(char ch) { return Character.toLowerCase(ch); } /* * 如果字符ch是分隔符,返回true * */ protected boolean isTokenChar(char ch) { for(char c : splitChars) { if(ch == c) { return true; } } return false; } }
/* * 过滤TokenStream,需要更改Token的PositionIncrementAttribute属性 * */ public class HStopTokenFilter extends TokenFilter { /* * TokenStream流Token文本属性 * */ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /* * 当前Token与前一个Token位移差属性 * */ private PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private int skippedPositions; /* * 停用单词集合 * */ private CharArraySet stopWords; protected HStopTokenFilter(TokenStream input) { super(input); } public HStopTokenFilter(TokenStream input , CharArraySet stopWords) { this(input); this.stopWords = stopWords; } @Override public boolean incrementToken() throws IOException { clearAttributes(); // 清除上个Token所有属性 skippedPositions = 0; while(input.incrementToken()) { if(filter()) { // 过滤掉当前Token,修改skippedPositions skippedPositions += posIncrAtt.getPositionIncrement(); } else { // 当前Token不可过滤,如果前一个Token被过滤,需修改当前Token的PositionIncrementAttribute属性 if(skippedPositions != 0) { posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); } return true; } } return false; } private boolean filter() { return stopWords.contains(termAtt.buffer() , 0 , termAtt.length()); } }
通过自定义的HAnalyzer,可以完成文本分析,示例如下:
public class Main { public static void main(String []args) { HAnalyzer analyzer = new HAnalyzer(); TokenStream ts = null; try { ts = analyzer.tokenStream("myfield", new StringReader("I am a student.My name is Tom!")); //获取词元位置属性 OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //获取词元文本属性 CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //重置TokenStream(重置StringReader) ts.reset(); //迭代获取分词结果 while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() ); } //关闭TokenStream(关闭StringReader) ts.end(); } catch (IOException e) { e.printStackTrace(); } } }
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。