Java stream-style XML reading: preprocessing a large file for data mining. The program below scans dblp.xml line by line, extracts each paper's author list, repeatedly filters out authors below a minimum support count, and finally writes an ARFF file that Weka can mine.
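Note that the program itself reads the XML with plain line scanning, which relies on dblp.xml keeping each <author> element on its own line. For reference only, here is a minimal sketch of true stream parsing using the JDK's StAX API (javax.xml.stream). It is not part of the original program, and it assumes the dblp.dtd file referenced by dblp.xml is available locally so that entity references resolve.

import java.io.FileInputStream;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamReader;

// Sketch only: stream-parse dblp.xml with StAX instead of line scanning.
public class StaxAuthorScan {
    public static void main(String[] args) throws Exception {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLStreamReader reader =
                factory.createXMLStreamReader(new FileInputStream("dblp.xml"));
        int authors = 0;
        while (reader.hasNext()) {
            if (reader.next() == XMLStreamConstants.START_ELEMENT
                    && "author".equals(reader.getLocalName())) {
                // getElementText() reads up to the matching </author>
                String name = reader.getElementText();
                if (!name.isEmpty()) authors++; // a real pass would write the name out
            }
        }
        reader.close();
        System.out.println("author elements seen: " + authors);
    }
}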
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * @author gjf
 * db_pre.arff stores the author information extracted from the xml file.
 */
public class ElmAuth {
    Map<String, Integer> map = new HashMap<String, Integer>();

    // Step 1: extract the <author> entries from the xml file, write them to
    // db_pre.arff (one paper per line, authors comma-separated), and strip
    // special characters along the way.
    public void settleXml(String src, String dst) { // src=dblp.xml, dst=db_pre.arff
        File file = new File(src);
        File fl = new File(dst);
        try {
            FileReader fr = new FileReader(file);
            FileWriter fw = new FileWriter(fl);
            BufferedReader br = new BufferedReader(fr);
            BufferedWriter bw = new BufferedWriter(fw);
            String line;
            boolean flag = true;
            int loc_st;
            int loc_end;
            int len = 0, max = 0;
            while ((line = br.readLine()) != null) {
                loc_st = line.indexOf("<author>");
                if (loc_st != -1) {
                    loc_end = line.indexOf("</author>");
                    // the text between <author> and </author> is one author's name
                    line = line.substring(loc_st + 8, loc_end);
                    line = line.replace('&', ' ');
                    line = line.replace('$', ' ');
                    line = line.replace("' ", " ");
                    line = line.replace("'", " ");
                    // flag marks paper boundaries: while it stays false we are
                    // inside the same paper, so its authors go on the same line
                    if (flag) {
                        bw.write("\n");
                        bw.write(line);
                    } else {
                        bw.write(",");
                        bw.write(line);
                    }
                    len++; // count each author written for the current paper
                    flag = false;
                } else {
                    flag = true;
                    if (max < len) max = len; // track the largest author count
                    len = 0;
                    bw.flush();
                }
            }
            System.out.println("Step 1 - largest number of authors on one paper: " + max);
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Step 2: remove single-author items - drop every line of db_pre.arff
    // that contains only one author.
    public void elimate_one(String src, String dst) { // src=db_pre.arff, dst=db_elone.arff
        try {
            File file = new File(src);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);
            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);
            String line;
            int res = 0;
            while ((line = br.readLine()) != null) {
                // authors are separated by ","; fewer than two fields means a
                // single author, so the line is not written
                String[] arrLine = line.split(",");
                if (arrLine.length > 1) {
                    bw.write(line);
                    bw.write("\n");
                    res++;
                }
            }
            bw.flush();
            bw.close();
            br.close();
            System.out.println("Rows left after removing single-author papers: " + res);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Store the remaining authors in the HashMap: key = author name,
    // value = number of occurrences (the support count).
    public void createMap(String src) { // src=db_elone.arff
        try {
            File file = new File(src);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);
            String line;
            while ((line = br.readLine()) != null) {
                String[] arrLine = line.split(",");
                for (int i = 0; i < arrLine.length; ++i) {
                    if (map.get(arrLine[i]) == null) {
                        map.put(arrLine[i], 1);
                    } else {
                        map.put(arrLine[i], map.get(arrLine[i]) + 1);
                    }
                }
            }
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Remove from the map every author whose support is below minsup;
    // this run uses a support count of 100.
    public void settleMap(int minsup) {
        Iterator<String> it = map.keySet().iterator();
        while (it.hasNext()) {
            String str = it.next();
            if (map.get(str) < minsup) {
                it.remove();
            }
        }
        System.out.println("Map size (authors meeting the support threshold): " + map.size());
    }

    // Write the papers, restricted to the authors that meet minsup, to db_minsup.arff.
    public void updateMap(String src, String dst) { // src=db_elone.arff, dst=db_minsup.arff
        try {
            File filer = new File(src);
            FileReader fr = new FileReader(filer);
            BufferedReader br = new BufferedReader(fr);
            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);
            String line;
            int res = 0;
            boolean flag = true;
            while ((line = br.readLine()) != null) {
                String[] arrLine = line.split(",");
                if (!flag) res++; // the previous line produced output: count it
                flag = true;
                for (int i = 0; i < arrLine.length; ++i) {
                    if (map.get(arrLine[i]) != null) { // keep only the filtered authors
                        if (flag) {
                            bw.write("\n" + arrLine[i]);
                            flag = false;
                        } else {
                            bw.write("," + arrLine[i]);
                        }
                    }
                }
            }
            if (!flag) res++; // count the final output line, which the loop above missed
            bw.flush();
            System.out.println("Papers co-written by the filtered authors: " + res);
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Generate the file Weka can read.
    public void createWekaFile(String src, String dst) { // src=db_minsup.arff, dst=db.arff
        try {
            File filer = new File(src);
            FileReader fr = new FileReader(filer);
            BufferedReader br = new BufferedReader(fr);
            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);
            bw.write("@relation db" + "\n");
            Iterator<String> it = map.keySet().iterator();
            while (it.hasNext()) {
                String str = it.next();
                // escape single quotes; the original called replace() but discarded the result
                str = str.replace("'", "\\'");
                bw.write("@attribute '" + str + "' { t}\n");
            }
            bw.write("@data" + "\n");
            String line;
            boolean flag = true;
            while ((line = br.readLine()) != null) {
                flag = true;
                char ch;
                it = map.keySet().iterator();
                while (it.hasNext()) {
                    String str = it.next();
                    // note: indexOf is a substring match, so an author name
                    // contained in another could produce a false positive
                    if (line.indexOf(str) >= 0) {
                        ch = 't'; // the author appears on this paper
                    } else {
                        ch = '?'; // Weka's missing-value marker
                    }
                    if (flag) {
                        bw.write(ch);
                    } else {
                        bw.write("," + ch);
                    }
                    flag = false;
                }
                bw.write("\n");
            }
            bw.flush();
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void clearMap() {
        map.clear();
    }

    public static void main(String[] args) {
        ElmAuth elmauth = new ElmAuth();
        elmauth.settleXml("dblp.xml", "db_pre.arff");
        elmauth.elimate_one("db_pre.arff", "db_elone.arff");
        elmauth.createMap("db_elone.arff");
        elmauth.settleMap(100); // fixed minimum support count
        elmauth.updateMap("db_elone.arff", "db_minsup.arff");
        // iterate: dropping low-support authors creates new single-author lines,
        // which in turn lowers the remaining authors' support counts
        for (int i = 0; i < 20; ++i) {
            System.out.println();
            elmauth.elimate_one("db_minsup.arff", "db_minsup_elone.arff");
            elmauth.clearMap();
            elmauth.createMap("db_minsup_elone.arff");
            elmauth.settleMap(100);
            elmauth.updateMap("db_minsup_elone.arff", "db_minsup.arff");
        }
        elmauth.createWekaFile("db_minsup.arff", "db.arff");
    }
}
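For orientation, this is the shape of the db.arff file that createWekaFile emits (the author names here are made up; the real attribute names are the surviving map keys). Each attribute has the single nominal value t, and '?' is Weka's missing-value marker:

@relation db
@attribute 'Wei Wang' { t}
@attribute 'Jiawei Han' { t}
@data
t,?
?,t
t,t

This sparse t/? encoding is the usual way to feed market-basket style transactions to Weka's association-rule miners such as Apriori, which is presumably the next step after this preprocessing.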