A simple TermFilter implementation in Lucene
After a day of digging I've finally made some progress. All comments and criticism are welcome! Lucene version: 4.3.1
A side note: I originally set out to write about spatial search, but in the course of researching it I ended up at TermFilter, so don't be surprised by the code below. If I get the chance I'll write about the spatial implementation too; ready-made implementations exist, but I still want to understand how it actually works.
Criticism is welcome, as long as it's constructive~
The core class:
package com.pptv.search.list.index.increment;

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

public class MyOwnFilter extends Filter {

    public static void main(String[] args) throws Exception {
        SpatialSearchTest.main(args);
    }

    @Override
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        System.out.println(">>>> MyOwnFilter in");
        final AtomicReader reader = context.reader();

        // A. Allocate a result bit set sized to the segment's maxDoc.
        FixedBitSet result = new FixedBitSet(reader.maxDoc());

        // B. Get all indexed fields from the reader.
        final Fields fields = reader.fields();
        showFields(fields);

        // Look up the terms of the "able" field.
        String fieldName = "able";
        Terms terms = fields.terms(fieldName);
        if (terms == null) {
            return null; // field absent in this segment: nothing matches
        }
        System.out.println(fieldName + " terms.size() = " + terms.size());

        // C. Enumerate every term of the field. Iterating until next()
        // returns null is safer than looping over terms.size(), which some
        // codecs report as -1.
        TermsEnum termsEnum = terms.iterator(null);
        DocsEnum docs = null; // declared outside the loop so it is actually reused
        BytesRef term;
        int i = 0;
        while ((term = termsEnum.next()) != null) {
            System.out.println("----" + i + "----" + term);
            // utf8ToString() honors the BytesRef offset, unlike
            // new String(term.bytes, 0, term.length, ...), which breaks
            // when offset != 0.
            System.out.println("content:" + term.utf8ToString());
            // D. Check whether a given term exists among the field's terms:
            // BytesRef text = new BytesRef("2".getBytes());
            // System.out.println(termsEnum.seekExact(text, false));
            // System.out.println(text);

            // E. Walk the postings (inverted list) of this term.
            // FLAG_NONE: we don't need term frequencies here.
            docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
            int docId;
            while ((docId = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                System.out.println("collected:" + docId);
                result.set(docId);
            }
            i++;
        }
        System.out.println("<<<< MyOwnFilter out");
        return result;
    }

    private void showFields(final Fields fields) {
        System.out.println("fields.size() = " + fields.size());
        Iterator<String> ite = fields.iterator();
        int i = 0;
        while (ite.hasNext()) {
            ++i;
            System.out.println("\t" + i + ":" + ite.next());
        }
    }
}
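Section D in the code above (commented out) hints at a more targeted variant: instead of collecting the postings of every term, seek directly to a single term. Below is a minimal sketch of what the body of getDocIdSet could look like in that case; the field name "able" and the term text "0" are illustrative values taken from the test data further down.

// Variant of getDocIdSet that matches one specific term only, using the
// seekExact call from the commented-out section D above.
TermsEnum termsEnum = terms.iterator(null);
if (termsEnum.seekExact(new BytesRef("0"), false)) {
    DocsEnum docs = termsEnum.docs(acceptDocs, null, DocsEnum.FLAG_NONE);
    int docId;
    while ((docId = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        result.set(docId);
    }
}
return result;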
The entry-point class:
package com.pptv.search.list.index.increment;

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class SpatialSearchTest {

    static Version version = Version.LUCENE_43;

    public static void main(String[] args) throws Exception {
        RAMDirectory d = new RAMDirectory();
        IndexWriter writer = new IndexWriter(d,
                new IndexWriterConfig(version, new StandardAnalyzer(version)));
        doIndex(writer);

        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(d));
        System.out.println("maxDoc:" + searcher.getIndexReader().maxDoc());

        // Query + Filter. The original draft used a custom MyQuery class
        // (not shown in this post); MatchAllDocsQuery is substituted here so
        // the example is self-contained -- the filter is what matters.
        Query query = new MatchAllDocsQuery();
        query.setBoost(1.0001f);
        System.out.println("query:" + query);

        Filter filter = createFilter();
        System.out.println("filter:" + filter);

        TopDocs tds = searcher.search(query, filter, 10);
        for (int i = 0; i < tds.scoreDocs.length; i++) {
            ScoreDoc sd = tds.scoreDocs[i];
            Document doc = searcher.doc(sd.doc);
            printDoc(doc);
        }
    }

    private static Filter createFilter() {
        // filter = new MyFilter(new Term("able", "1"));
        return new MyOwnFilter();
    }

    private static void printDoc(Document doc) {
        String lat = doc.get("lat");
        String lng = doc.get("lng");
        System.out.println("(" + lng + "," + lat + ")");
    }

    private static void doIndex(IndexWriter writer) throws IOException {
        for (int i = 0; i < 5; i++) { // the original bound (i < 100 && i < 5) caps at 5
            Document document = new Document();
            indexLocation(document, 100L + i, (Math.random() * 100L) + i * i,
                    i % 2 == 0 ? "0" : "abcd你好");
            writer.addDocument(document);
        }
        writer.forceMerge(1);
        writer.close();
    }

    private static void indexLocation(Document document, double longitude,
            double latitude, String able) {
        document.add(new StringField("able", able, Store.YES));
        document.add(new DoubleField("lat", latitude, Store.YES));
        document.add(new DoubleField("lng", longitude, Store.YES));
    }
}
In essence it all comes down to the following method that Filter exposes to us:
public abstract DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException;
From the context we get the reader, from the reader the fields, then the terms, and finally we call
public abstract DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException;
to collect the matching documents into a DocIdSet and return it. Note that this whole process executes during the search itself.
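Boiled down, the whole chain described above looks like this (a condensed sketch of MyOwnFilter with the printing stripped out; "able" is the field from the test data):

@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    AtomicReader reader = context.reader();          // context -> reader
    FixedBitSet result = new FixedBitSet(reader.maxDoc());
    Terms terms = reader.fields().terms("able");     // reader -> fields -> terms
    if (terms == null) {
        return null;
    }
    TermsEnum termsEnum = terms.iterator(null);
    while (termsEnum.next() != null) {               // terms -> each term
        DocsEnum docs = termsEnum.docs(acceptDocs, null, DocsEnum.FLAG_NONE);
        int docId;
        while ((docId = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            result.set(docId);                       // postings -> bit set
        }
    }
    return result;
}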
One more thing I'd like the experts to weigh in on:
Right now I'm guessing that Lucene runs the query first and then applies the filter. Is that right? Somehow it doesn't feel correct. I'd appreciate a definitive answer; when I get the chance I'll come back and verify this myself.
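A pointer for when I (or you) verify this: in the 4.x sources, IndexSearcher.search(query, filter, n) wraps both arguments into a FilteredQuery, whose default strategy consults the filter while the query is being scored (either through a random-access bit set or by leap-frogging the two iterators), rather than running the full query first and filtering afterwards. In other words, the search call in the entry class above should behave like:

// Equivalent formulation: the filter is folded into the query and consulted
// during the search, not applied to the results afterwards.
TopDocs tds = searcher.search(new FilteredQuery(query, filter), 10);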