利用htmlparser提取网页纯文本的例子
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
importorg.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* 标题:利用htmlparser提取网页纯文本的例子
*/
public class TestHTMLParser2 {
/**
* 读取目标html内容
*
*/
publicstatic void testHtml() {
try{
StringsCurrentLine;
StringsTotalString;
sCurrentLine= "";
sTotalString= "";
java.io.InputStreaml_urlStream;
java.net.URLl_url = new java.net.URL(
"http://10.249.187.199:8083/injs100/");
java.net.HttpURLConnectionl_connection = (java.net.HttpURLConnection) l_url
.openConnection();
l_connection.connect();
l_urlStream= l_connection.getInputStream();
java.io.BufferedReaderl_reader = new java.io.BufferedReader(
newjava.io.InputStreamReader(l_urlStream));
while((sCurrentLine = l_reader.readLine()) != null) {
sTotalString+= sCurrentLine + "\r\n";
}
StringtestText = extractText(sTotalString);
}catch (Exception e) {
e.printStackTrace();
}
}
/**
* 抽取纯文本信息
* @param inputHtml:html文本
* @return
* @throws Exception
*/
publicstatic String extractText(String inputHtml) throws Exception {
StringBuffertext = new StringBuffer();
Parserparser = Parser.createParser(new String(inputHtml.getBytes(),
"GBK"),"GBK");
//遍历所有的节点
NodeListnodes = parser.extractAllNodesThatMatch(new NodeFilter() {
publicboolean accept(Node node) {
returntrue;
}
});
System.out.println(nodes.size());
for(int i = 0; i < nodes.size(); i++) {
Nodenodet = nodes.elementAt(i);
//字符串的代表性节点:节点的描述
text.append(newString(nodet.toPlainTextString().getBytes("GBK"))
+"\r\n");
}
returntext.toString();
}
/**
* Parses the given resource and looks up its table tags.
* The parser is created from the resource, its encoding is set to "GBK",
* all nodes whose tag name matches "table" are collected, and the node at
* index 1 (the second match) is cast to a TableTag.
* Per the original note, the resource may be a file path or a URL.
* NOTE(review): the method body appears truncated in this listing; no
* closing brace follows the final statement.
* @param resource file path or URL to parse
* @throws Exception declared by the signature; the specific failure
*         conditions are not visible here
*/
publicstatic void test5(String resource) throws Exception {
ParsermyParser = new Parser(resource);
myParser.setEncoding("GBK");
StringfilterStr = "table";
NodeFilterfilter = new TagNameFilter(filterStr);
NodeListnodeList = myParser.extractAllNodesThatMatch(filter);
/*for(inti=0;i<nodeList.size();i++)
{
TableTagtabletag = (TableTag) nodeList.elementAt(i);
// tag name
System.out.println(tabletag.getTagName());
System.out.println(tabletag.getText());
}*/
TableTagtabletag = (TableTag) nodeList.elementAt(1);
http://c.tieba.baidu.com/p/3392402431
http://c.tieba.baidu.com/p/3392405849
http://c.tieba.baidu.com/p/3392411579
http://c.tieba.baidu.com/p/3392416082
http://c.tieba.baidu.com/p/3392420343
http://c.tieba.baidu.com/p/3392424840
http://c.tieba.baidu.com/p/3392429238
http://c.tieba.baidu.com/p/3392433592
http://c.tieba.baidu.com/p/3392437923
http://c.tieba.baidu.com/p/3392442352
http://c.tieba.baidu.com/p/3392446861
http://c.tieba.baidu.com/p/3392451443
http://c.tieba.baidu.com/p/3392455917
http://c.tieba.baidu.com/p/3392460228
http://c.tieba.baidu.com/p/3392464693
http://c.tieba.baidu.com/p/3392469164
http://c.tieba.baidu.com/p/3392473582
http://c.tieba.baidu.com/p/3392478006
http://c.tieba.baidu.com/p/3392482459
http://c.tieba.baidu.com/p/3392487114
http://c.tieba.baidu.com/p/3392491670
http://c.tieba.baidu.com/p/3392496190
http://c.tieba.baidu.com/p/3392500743
http://c.tieba.baidu.com/p/3392505309
http://c.tieba.baidu.com/p/3392509820
http://c.tieba.baidu.com/p/3392514415
http://c.tieba.baidu.com/p/3392519080
http://c.tieba.baidu.com/p/3392523563
http://c.tieba.baidu.com/p/3392528180
http://c.tieba.baidu.com/p/3392532879
http://c.tieba.baidu.com/p/3392537592
http://c.tieba.baidu.com/p/3392542235
http://c.tieba.baidu.com/p/3392546757
http://c.tieba.baidu.com/p/3392551549
http://c.tieba.baidu.com/p/3392556372
http://c.tieba.baidu.com/p/3392561202
http://c.tieba.baidu.com/p/3392566194
http://c.tieba.baidu.com/p/3392571173
http://c.tieba.baidu.com/p/3392576221
http://c.tieba.baidu.com/p/3392581132
http://c.tieba.baidu.com/p/3392586257
http://c.tieba.baidu.com/p/3392591293
http://c.tieba.baidu.com/p/3392596453
http://c.tieba.baidu.com/p/3392601756
http://c.tieba.baidu.com/p/3392606977
http://c.tieba.baidu.com/p/3392612191
http://c.tieba.baidu.com/p/3392617313
http://c.tieba.baidu.com/p/3392622531
http://c.tieba.baidu.com/p/3392627719
http://c.tieba.baidu.com/p/3392633007
http://c.tieba.baidu.com/p/3392638423
http://c.tieba.baidu.com/p/3392643790
http://c.tieba.baidu.com/p/3392649198
http://c.tieba.baidu.com/p/3392654429
http://c.tieba.baidu.com/p/3392659821
http://c.tieba.baidu.com/p/3392665260
http://c.tieba.baidu.com/p/3392670593
http://c.tieba.baidu.com/p/3392676058
http://c.tieba.baidu.com/p/3392681592
http://c.tieba.baidu.com/p/3392687221
http://c.tieba.baidu.com/p/3392692701
http://c.tieba.baidu.com/p/3392698484
http://c.tieba.baidu.com/p/3392704140
http://c.tieba.baidu.com/p/3392783374
http://c.tieba.baidu.com/p/3392798951
http://c.tieba.baidu.com/p/3392817954
http://c.tieba.baidu.com/p/3392817954
http://c.tieba.baidu.com/p/3392841279
http://c.tieba.baidu.com/p/3392850395
http://c.tieba.baidu.com/p/3392856094
http://c.tieba.baidu.com/p/3392861130
http://c.tieba.baidu.com/p/3392870496
http://c.tieba.baidu.com/p/3392903790
http://c.tieba.baidu.com/p/3392910784
http://c.tieba.baidu.com/p/3392915350
郑重声明：本站内容如果来自互联网及其他传播媒体，其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享，并不代表本站赞同其观点和对其真实性负责，也不构成任何其他建议。