Lucene分词器
Lucene分析器的基类为Analyzer,Analyzer包含两个核心组件:Tokenizer和 TokenFilter。自定义分析器必须实现Analyzer类的抽象方法createComponents(String)来定义TokenStreamComponents。在调用方法tokenStream(String, Reader)的时候,TokenStreamComponents会被重复使用。
自定义分析器首先需要继承Analyzer类,代码如下:
/**
 * Custom analyzer that tokenizes on whitespace/punctuation via {@link LetterTokenizer}
 * and optionally removes stop words via {@link HStopTokenFilter}.
 */
public class HAnalyzer extends Analyzer {

    /** True when a stop-word set was supplied; the default constructor disables filtering. */
    private boolean useStopWords;

    /** Stop words to drop from the token stream; only read when {@code useStopWords} is true. */
    private CharArraySet stopWords;

    /** Creates an analyzer that performs no stop-word filtering. */
    public HAnalyzer() {
        useStopWords = false;
    }

    /**
     * Creates an analyzer that filters out the given stop words.
     *
     * @param stopWords the set of words to remove from the stream
     */
    public HAnalyzer(CharArraySet stopWords) {
        useStopWords = true;
        this.stopWords = stopWords;
    }

    /**
     * Builds the tokenizer/filter chain. Called once per field; the resulting
     * components are cached and reused by {@link Analyzer#tokenStream}.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        LetterTokenizer source = new LetterTokenizer();
        if (!useStopWords) {
            return new TokenStreamComponents(source);
        }
        return new TokenStreamComponents(source, new HStopTokenFilter(source, stopWords));
    }
}
Analyzer的两个核心组件:Tokenizer和TokenFilter,实现如下:
/* * 分词解析器,需要定义Token属性CharTermAttribute offsetAttribute * */ public class LetterTokenizer extends Tokenizer { /* * 词元文本属性 * */ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /* * 词元位移属性 * */ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /* * Token文本最大长度 * */ private static final int MAX_WORD_LEN = 255; /* * Buffer Size * */ private static final int IO_BUFFER_SIZE = 4096; private char[] ioBuffer = new char[IO_BUFFER_SIZE]; /* * Token分隔符集合 * */ private char[] splitChars = {‘ ‘,‘,‘,‘.‘,‘!‘}; /* * 当前字符串在原字符串中的位置 * */ private int offset = 0; /* * 当前字符在这一次读取的字符串中的位置 * */ private int bufferIndex = 0; /* * 每次读取字符串的长度 * */ private int dataLen = 0; @Override public boolean incrementToken() throws IOException { clearAttributes(); // 清除前一个Token的所有属性 int length = 0; // 单词的长度 int start = bufferIndex; char []buffer = termAtt.buffer(); while(true) { if(bufferIndex >= dataLen) { // 分词处理到ioBuffer末尾时,继续从input读取数据 offset += dataLen; dataLen = input.read(ioBuffer); if(dataLen == -1) { // 在Reader读取结束 dataLen = 0; if(length > 0) { // 虽然从input读取完数据,ioBuffer处理的字符 还没有生成Token break; } else { return false; } } bufferIndex = 0; // 指向ioBuffer的起始位置 } /**处理ioBuffer读取的字符*/ final char ch = ioBuffer[bufferIndex++]; if(isTokenChar(ch)) { // ch分隔符,形成Token,跳出循环 if(length == 0) { start = offset + bufferIndex - 1; } else if(length == buffer.length) { buffer = termAtt.resizeBuffer(length + 1); } if(length == MAX_WORD_LEN) { break; } break; } else { buffer[length++] = normalize(ch); // CharTermAttribute文本赋值 } } termAtt.setLength(length); offsetAtt.setOffset(correctOffset(start), correctOffset(start + length)); return true; } /* * 规整化--->转为小写 * */ protected char normalize(char ch) { return Character.toLowerCase(ch); } /* * 如果字符ch是分隔符,返回true * */ protected boolean isTokenChar(char ch) { for(char c : splitChars) { if(ch == c) { return true; } } return false; } }
/* * 过滤TokenStream,需要更改Token的PositionIncrementAttribute属性 * */ public class HStopTokenFilter extends TokenFilter { /* * TokenStream流Token文本属性 * */ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /* * 当前Token与前一个Token位移差属性 * */ private PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private int skippedPositions; /* * 停用单词集合 * */ private CharArraySet stopWords; protected HStopTokenFilter(TokenStream input) { super(input); } public HStopTokenFilter(TokenStream input , CharArraySet stopWords) { this(input); this.stopWords = stopWords; } @Override public boolean incrementToken() throws IOException { clearAttributes(); // 清除上个Token所有属性 skippedPositions = 0; while(input.incrementToken()) { if(filter()) { // 过滤掉当前Token,修改skippedPositions skippedPositions += posIncrAtt.getPositionIncrement(); } else { // 当前Token不可过滤,如果前一个Token被过滤,需修改当前Token的PositionIncrementAttribute属性 if(skippedPositions != 0) { posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); } return true; } } return false; } private boolean filter() { return stopWords.contains(termAtt.buffer() , 0 , termAtt.length()); } }
通过自定义的HAnalyzer,可以完成文本分析,示例如下:
public class Main { public static void main(String []args) { HAnalyzer analyzer = new HAnalyzer(); TokenStream ts = null; try { ts = analyzer.tokenStream("myfield", new StringReader("I am a student.My name is Tom!")); //获取词元位置属性 OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //获取词元文本属性 CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //重置TokenStream(重置StringReader) ts.reset(); //迭代获取分词结果 while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() ); } //关闭TokenStream(关闭StringReader) ts.end(); } catch (IOException e) { e.printStackTrace(); } } }
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。