Java读取中文分词工具(四)



import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.StringTokenizer;

/**
 * Sliding-window reader over a pre-segmented text file.
 *
 * <p>File format: UTF-8 text that has already been word-segmented; words are
 * separated by spaces and each line holds one paragraph. The class is meant
 * for files whose lines are short (one paragraph per line), because a whole
 * line is tokenized into memory at once.
 *
 * <p>Each call to {@link #getNextWords(int)} returns the next window of
 * {@code count} consecutive words, advancing by one word. Windows never span
 * two paragraphs.
 *
 * <p>Three modes are available:
 * <ul>
 *   <li>{@link #normalMode}: stop at the end of the file;</li>
 *   <li>{@link #againMode}: at the end of the file, rewind and start over;</li>
 *   <li>{@link #paraAgainMode}: replay each paragraph several times, then stop
 *       at the end of the file.</li>
 * </ul>
 *
 * <p>The reader holds an open file handle; call {@link #close()} when done.
 */
public class ParaWordReader implements Reader
{
	/** Stop when the end of the file is reached. */
	static final int normalMode = 0;
	/** Rewind to the start of the file when its end is reached. */
	static final int againMode = 1;
	/** Replay each paragraph several times; stop at the end of the file. */
	static final int paraAgainMode = 2;
	private int currentMode = 0;

	private RandomAccessFile raf = null;
	private File file;
	/** Words of the paragraph currently being scanned. */
	private ArrayList<String> paraWords = null;

	/** Zero-based index of the current paragraph; -1 before the first read and right after a rewind. */
	private int currentPara = -1;
	/** Index in {@code paraWords} of the first word of the next window. */
	private int paraPos = 0;
	/** Number of replays already performed on the current paragraph. */
	private int paraIter = 0;
	/** Number of replays allowed per paragraph in {@link #paraAgainMode}. */
	private int paraIters = 1;
	/** Number of times the file has been rewound; used to detect a fruitless full pass. */
	private long rewinds = 0;

	/**
	 * Opens the given file for reading.
	 *
	 * @param fileName path of the segmented text file
	 * @throws IOException if the file cannot be opened for reading
	 */
	public  ParaWordReader(String fileName) throws IOException
	{
		file = new File(fileName);
		raf = new RandomAccessFile(file, "r");
		paraWords = new ArrayList<String>();
	}

	/**
	 * Selects the reading mode.
	 *
	 * @param m one of {@link #normalMode}, {@link #againMode}, {@link #paraAgainMode}
	 */
	public void setMode(int m)
	{
		currentMode = m;
	}

	/**
	 * Sets how many times a paragraph is replayed after its first pass and
	 * switches the reader to {@link #paraAgainMode}.
	 *
	 * @param iters number of replays per paragraph
	 */
	public void setParaIters(int iters)
	{
		paraIters = iters;
		setMode(paraAgainMode);
	}

	/**
	 * @return the zero-based index of the paragraph currently being scanned,
	 *         or -1 if no paragraph has been read yet
	 */
	public int paraIndex()
	{
		return currentPara;
	}

	/**
	 * Releases the underlying file handle. The reader must not be used afterwards.
	 *
	 * @throws IOException if closing the file fails
	 */
	public void close() throws IOException
	{
		raf.close();
	}

	/**
	 * Loads the next line of the file into {@code paraWords}.
	 *
	 * @return {@code true} if a paragraph was loaded; {@code false} if the end
	 *         of the file was reached and the mode does not rewind, or if the
	 *         file is empty
	 * @throws IOException if reading or seeking fails
	 */
	private boolean readPara() throws IOException
	{
		String line = raf.readLine();
		if(line == null)// end of file
		{
			if(currentMode == normalMode || currentMode == paraAgainMode)
			{
				return false;
			}
			System.out.println("文件太大可能不支持");
			raf.seek(0);
			currentPara = -1;
			rewinds++;
			line = raf.readLine();
			if(line == null)
			{
				// Empty file: rewinding again can never produce a paragraph.
				return false;
			}
		}
		paraWords.clear();
		// RandomAccessFile.readLine() widens every byte to one char, so encoding
		// back as ISO-8859-1 recovers the raw bytes, which are then decoded as UTF-8.
		line = new String(line.getBytes("iso8859-1"), "utf-8");
		StringTokenizer tokenizer = new StringTokenizer(line, " ");
		while(tokenizer.hasMoreTokens())
		{
			paraWords.add(tokenizer.nextToken());
		}
		currentPara++;
		paraPos = 0;
		return true;
	}

	/**
	 * Returns the next window of {@code count} consecutive words from the
	 * current paragraph and advances the window start by one word. When the
	 * current paragraph has no further full window, the paragraph is replayed
	 * (in {@link #paraAgainMode}, while replays remain) or the next paragraph
	 * is loaded. Paragraphs with fewer than {@code count} words are skipped.
	 *
	 * @param count number of words per window; must be at least 1
	 * @return an array of exactly {@code count} words, or {@code null} when no
	 *         further window can be produced (end of file in a non-rewinding
	 *         mode, or no paragraph in the file has {@code count} words)
	 * @throws IllegalArgumentException if {@code count} is less than 1
	 * @throws IOException if reading the file fails
	 */
	public String[] getNextWords(int count) throws IOException
	{
		if(count < 1)
		{
			throw new IllegalArgumentException("count must be at least 1: " + count);
		}
		long rewindsAtEntry = rewinds;
		// Written as a subtraction so a very large count cannot overflow int.
		// A window fits as long as paraPos + count <= size, so the last window
		// of a paragraph (paraPos + count == size) is returned, not dropped.
		while(paraWords.size() - paraPos < count)
		{
			// Replaying a paragraph that is shorter than count can never yield a
			// window, so such paragraphs go straight to the next line.
			if(currentMode == paraAgainMode && paraIter < paraIters
					&& paraWords.size() >= count)
			{
				paraPos = 0;
				paraIter++;
				continue;
			}
			paraIter = 0;
			if(!readPara())
			{
				return null;
			}
			// Two rewinds inside one call mean a complete pass over the file
			// produced no window; give up instead of looping forever.
			if(rewinds - rewindsAtEntry >= 2)
			{
				return null;
			}
		}
		String[] words = paraWords.subList(paraPos, paraPos + count).toArray(new String[count]);
		paraPos++;
		return words;
	}

	/**
	 * Demo: prints 5-word windows from a sample file in {@link #againMode},
	 * then the index of the last paragraph reached.
	 */
	public static void main(String[] args) throws IOException
	{
		ParaWordReader wordReader = new ParaWordReader("/media/linger/G/sources/ParaModel/electronic_seg.txt");
		try
		{
			wordReader.currentMode = ParaWordReader.againMode;
			// The sample file has 614005 lines.
			for(int i=0;i<614005*2;i++)
			{
				String[] words = wordReader.getNextWords(5);
				if(words == null) break;
				System.out.printf("%s,%s,%s,%s,%s \n",words[0],words[1],words[2],words[3],words[4]);
			}
			System.out.println(wordReader.currentPara);
		}
		finally
		{
			wordReader.close();
		}
	}

}


本文作者:linger

本文链接:http://blog.csdn.net/lingerlanlan/article/details/38337707



java读取中文分词工具(四),古老的榕树,5-wow.com

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。