Java读取中文分词工具(三)




import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;


/*
 * 文件格式:已分词的文本,词语之间用空格,换行等空白符分割。
 * 到了文件末尾就结束
 * 适合读取一行很大的文本,因为这里的缓冲不是一行,而是若干个词语(比一行少)。
 * 代码实现方式:每次读若干个词语作为一个句子,逐个字节读,以空白符区分词语的开始和结束。
 * 
 */
public class WordReader 
{
	RandomAccessFile raf = null;
	ArrayList<String> sentence = null;
	int senSize = 1000;
	int senPos =0 ;
	
	public WordReader(String fileName) throws IOException
	{
	     File file=new File(fileName);   
	     raf = new RandomAccessFile(file,"r") ; 
	     sentence = new ArrayList<String>();
	}

	public String[] getNextWords(int count) throws IOException
	{
		if(senPos+count >= sentence.size())//到了段落末尾,读取新的段落
		{
			if(readSentence())
				return getNextWords(count);
			else return null;
		}
		String[] words = new String[count];
		for(int i=0;i<count;i++)
		{
			words[i] = sentence.get(senPos+i);
		}
		senPos++;
		return words;
	}
	private boolean readSentence()
	{
		try
		{
			sentence.clear();
			for(int i=0;i<senSize;i++)
			{
				//System.out.println(i);
				
				int len = 0;
				while(true)
				{
					int b = raf.read();
					if(b == -1) return false;
					if(b == ' ' || b == '\n'|| b == '\r'|| b=='\t')
					{
						break;
					}
					len++;
				}
				raf.seek(raf.getFilePointer() -len-1);
				byte[] buffer = new byte[len];
				raf.read(buffer, 0, len);
				//byte[] sub = new byte[len];
				//for(int k=0;k<len;k++) sub[k] = buffer[k];
				String word = new String(buffer,"utf-8");//这里有坑,不会根据结束符0截断字符串,必须手动处理
				//System.out.println(word);
				sentence.add(word);
				while(true)
				{
					int b = raf.read();
					if(b == -1) return false;
					if(b == ' ' || b == '\n' || b == '\r' || b=='\t')
					{
						continue;
					}
					else break;
				}
				raf.seek(raf.getFilePointer() -1);
				
			}	
			senPos = 0;
			return true;
		}
		catch(EOFException ex)
		{
			ex.printStackTrace();
			return false;
		}
		catch(IOException ex)
		{
			ex.printStackTrace();
			return false;
		}
	
	}
	
	
	
	public static void main(String[] args) throws IOException 
	{
		// TODO Auto-generated method stub
		//WordReader wr = new WordReader("/home/linger/sources/ParaModel/electronic_seg.txt");
		WordReader wr = new WordReader("/home/linger/sources/resultbig.txt");
		wr.readSentence();
		//System.out.println("-------------------------");
		//wr.readSentence();
		//int i=0;
		//while(true)//614005行
		//{
			//String[] words = wr.getNextWords(5);
			//if(words == null) break;
			//System.out.println(i++);
			//System.out.println(words.length);
			//System.out.printf("%s,%s,%s,%s,%s \n",words[0],words[1],words[2],words[3],words[4]);
		//}
	}

}

本文作者:linger

本文链接:http://blog.csdn.net/lingerlanlan/article/details/38337483

java读取中文分词工具(三),古老的榕树,5-wow.com

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。