Lucene实战-Indexer索引创建

浏览数：39 / 时间：2015年06月09日

package com.lin.util;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Builds a Lucene index from the plain-text files found directly inside a
 * data directory (sub-directories are not descended into).
 *
 * <p>
 * Each accepted file becomes one document with three fields:
 * {@code contents} (built from a reader over the file), {@code filename} and
 * {@code fullpath} (both stored and not analyzed).
 *
 * <p>
 * An instance owns both the {@link IndexWriter} and the underlying
 * {@link Directory}; {@link #close()} releases both.
 */
public class Indexer implements AutoCloseable {

	/** Writer that receives every document produced by this indexer. */
	private final IndexWriter writer;

	/** Index storage opened by the constructor; closed together with the writer. */
	private final Directory directory;

	/**
	 * Entry point: indexes every {@code .txt} file in {@code dataDir} into the
	 * index located at {@code indexDir} and prints the number of files indexed
	 * and the elapsed time.
	 * 
	 * @param indexDir
	 *            location of the index
	 * @param dataDir
	 *            directory holding the files to index
	 * @throws IllegalArgumentException
	 *             if either argument is {@code null}
	 * @throws Exception
	 *             if the index cannot be opened, written or closed
	 */
	public static void index(String indexDir, String dataDir) throws Exception {
		if (indexDir == null || dataDir == null) {
			throw new IllegalArgumentException("请检查你的参数是否正确");
		}
		long start = System.currentTimeMillis();
		int numIndexed;
		// try-with-resources closes the writer and directory even when indexing
		// fails, and keeps the indexing failure as the primary exception.
		try (Indexer indexer = new Indexer(indexDir)) {
			numIndexed = indexer.index(dataDir, new TextFilesFilter());
		}
		long end = System.currentTimeMillis();
		System.out.println("Indexing " + numIndexed + " files took "
				+ (end - start) + " milliseconds");
	}

	/**
	 * Opens the index directory and creates the writer used to build the index.
	 * 
	 * @param indexDir
	 *            location of the index
	 * @throws IOException
	 *             if the directory or the writer cannot be opened
	 */
	private Indexer(String indexDir) throws IOException {
		Directory dir = FSDirectory.open(new File(indexDir));
		IndexWriter opened;
		try {
			// NOTE(review): the open mode is left at the IndexWriterConfig
			// default, so re-running against an existing index presumably
			// appends duplicate documents - confirm this is intended.
			IndexWriterConfig config = new IndexWriterConfig(
					Version.LUCENE_4_10_2, new IKAnalyzer());
			opened = new IndexWriter(dir, config);
		} catch (IOException | RuntimeException e) {
			// The writer never took ownership, so release the directory here.
			try {
				dir.close();
			} catch (IOException closeFailure) {
				e.addSuppressed(closeFailure);
			}
			throw e;
		}
		this.directory = dir;
		this.writer = opened;
	}

	/**
	 * Indexes every regular, visible, readable file directly inside
	 * {@code dataDir} that the filter accepts.
	 * 
	 * @param dataDir
	 *            directory holding the files to index
	 * @param filter
	 *            selects which files to index; {@code null} accepts every file
	 * @return the number of files indexed by this call
	 * @throws IOException
	 *             if {@code dataDir} cannot be listed (it does not exist, is
	 *             not a directory, or an I/O error occurred) or a file cannot
	 *             be indexed
	 */
	public int index(String dataDir, FileFilter filter) throws IOException {
		File[] files = new File(dataDir).listFiles();
		if (files == null) {
			// listFiles() returns null instead of throwing when the path is
			// not a listable directory; fail with a clear message rather than
			// a NullPointerException in the loop below.
			throw new IOException("Cannot list files in data directory: "
					+ dataDir);
		}
		// Count files added by this call; writer.numDocs() would also include
		// documents that were already present in the index.
		int indexed = 0;
		for (File f : files) {
			if (!f.isDirectory() && !f.isHidden() && f.canRead() && f.exists()
					&& (filter == null || filter.accept(f))) {
				indexFile(f);
				indexed++;
			}
		}
		return indexed;
	}

	/**
	 * Logs the file's canonical path and adds its document to the index.
	 * 
	 * @param f
	 *            file to index
	 * @throws IOException
	 *             if the file cannot be read or the document cannot be added
	 */
	private void indexFile(File f) throws IOException {
		System.out.println("indexing " + f.getCanonicalPath());
		Document doc = getDocument(f);
		writer.addDocument(doc);
	}

	/**
	 * Builds the Lucene document for one file.
	 * 
	 * @param f
	 *            file to convert
	 * @return a document with {@code contents}, {@code filename} and
	 *         {@code fullpath} fields
	 * @throws IOException
	 *             if the file cannot be opened or its canonical path cannot be
	 *             resolved
	 */
	@SuppressWarnings("deprecation")
	protected Document getDocument(File f) throws IOException {
		Document doc = new Document();
		// FileReader decodes with the platform default charset.
		// NOTE(review): the reader is not closed here; presumably Lucene
		// consumes and closes it while the document is added - confirm.
		doc.add(new Field("contents", new FileReader(f)));
		doc.add(new Field("filename", f.getName(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		return doc;
	}

	/**
	 * File filter that accepts files whose name ends with {@code .txt},
	 * ignoring case.
	 * 
	 * @author zan
	 */
	private static class TextFilesFilter implements FileFilter {

		@Override
		public boolean accept(File f) {
			return f.getName().toLowerCase().endsWith(".txt");
		}

	}

	/**
	 * Closes the index writer and then the underlying directory. The directory
	 * is closed even if closing the writer fails.
	 * 
	 * @throws IOException
	 *             if the writer or the directory cannot be closed
	 */
	@Override
	public void close() throws IOException {
		try {
			writer.close();
		} finally {
			directory.close();
		}
	}

	/**
	 * Command-line entry point.
	 * 
	 * @param args
	 *            optional: {@code args[0]} is the index location and
	 *            {@code args[1]} the data directory; the original hard-coded
	 *            paths are used for whichever is missing
	 * @throws Exception
	 *             if indexing fails
	 */
	public static void main(String[] args) throws Exception {
		String indexDir = args.length > 0 ? args[0] : "d:\\index";
		String dataDir = args.length > 1 ? args[1]
				: "D:\\Program Files\\TortoiseSVN";
		Indexer.index(indexDir, dataDir);
	}
}

郑重声明：本站内容如果来自互联网及其他传播媒体，其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享，并不代表本站赞同其观点和对其真实性负责，也不构成任何其他建议。