使用HtmlParser抓取网页内容
package parser;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
* 使用HtmlParser抓去网页内容: 要抓去页面的内容最方便的方法就是使用StringBean. 里面有几个控制页面内容的几个参数.
* 在后面的代码中会有说明. Htmlparser包中还有一个示例StringExtractor 里面有个直接得到内容的方法,
* 其中也是使用了StringBean . 另外直接解析Parser的每个标签也可以的.
*
*@author chenguoyong
*
*/
public class GetContent {
publicvoid getContentUsingStringBean(String url) {
StringBeansb = new StringBean();
sb.setLinks(true);// 是否显示web页面的连接(Links)
//为了取得页面的整洁美观一般设置上面两项为true , 如果要保持页面的原有格式, 如代码页面的空格缩进 可以设置为false
sb.setCollapse(true);// 如果是true的话把一系列空白字符用一个字符替代.
sb.setReplaceNonBreakingSpaces(true);//If true regular space
sb
.setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");
System.out.println("TheContent is :\n" + sb.getStrings());
}
publicvoid getContentUsingStringExtractor(String url, boolean link) {
//StringExtractor内部机制和上面的一样.做了一下包装
StringExtractorse = new StringExtractor(url);
Stringtext = null;
try{
text= se.extractStrings(link);
System.out.println("Thecontent is :\n" + text);
}catch (ParserException e) {
e.printStackTrace();
}
}
publicvoid getContentUsingParser(String url) {
NodeListnl;
try{
Parserp = new Parser(url);
nl= p.parse(new NodeClassFilter(BodyTag.class));
BodyTagbt = (BodyTag) nl.elementAt(0);
System.out.println(bt.toPlainTextString());// 保留原来的内容格式. 包含js代码
}catch (ParserException e) {
e.printStackTrace();
}
}
/**
* @param args
*/
publicstatic void main(String[] args) {
Stringurl = "http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html";
//newGetContent().getContentUsingParser(url);
//--------------------------------------------------
newGetContent().getContentUsingStringBean(url);
}
http://c.tieba.baidu.com/p/3476776824
http://c.tieba.baidu.com/p/3476808306
http://c.tieba.baidu.com/p/3476798710
http://c.tieba.baidu.com/p/3474281354
http://c.tieba.baidu.com/p/3474300101
http://c.tieba.baidu.com/p/3474294075
http://c.tieba.baidu.com/p/3474123295
http://c.tieba.baidu.com/p/3474314242
http://c.tieba.baidu.com/p/3474310411
http://c.tieba.baidu.com/p/3474304550
http://c.tieba.baidu.com/p/3475433945
http://c.tieba.baidu.com/p/3475430015
http://c.tieba.baidu.com/p/3475433348
http://c.tieba.baidu.com/p/3475431434
http://c.tieba.baidu.com/p/3474176863
http://c.tieba.baidu.com/p/3474159835
http://c.tieba.baidu.com/p/3474163941
http://c.tieba.baidu.com/p/3474156121
http://c.tieba.baidu.com/p/3474147660
http://c.tieba.baidu.com/p/3474151899
http://c.tieba.baidu.com/p/3474142287
http://c.tieba.baidu.com/p/3474136965
http://c.tieba.baidu.com/p/3474133165
http://c.tieba.baidu.com/p/3474128675
http://c.tieba.baidu.com/p/3474103896
http://c.tieba.baidu.com/p/3474099488
http://c.tieba.baidu.com/p/3474094120
http://c.tieba.baidu.com/p/3475431976
http://c.tieba.baidu.com/p/3474267991
http://c.tieba.baidu.com/p/3474259583
http://c.tieba.baidu.com/p/3474254990
http://c.tieba.baidu.com/p/3474228986
http://c.tieba.baidu.com/p/3474221626
http://c.tieba.baidu.com/p/3474215742
http://c.tieba.baidu.com/p/3474212122
http://c.tieba.baidu.com/p/3474188883
http://c.tieba.baidu.com/p/3474207722
http://c.tieba.baidu.com/p/3474184143
http://c.tieba.baidu.com/p/3474180522
http://c.tieba.baidu.com/p/3474171022
http://c.tieba.baidu.com/p/3474086627
http://c.tieba.baidu.com/p/3462847203
http://c.tieba.baidu.com/p/3462839334
http://c.tieba.baidu.com/p/3462834294
http://c.tieba.baidu.com/p/3462786130
http://c.tieba.baidu.com/p/3462782768
http://c.tieba.baidu.com/p/3461791753
http://c.tieba.baidu.com/p/3461784215
http://c.tieba.baidu.com/p/3461778008
http://c.tieba.baidu.com/p/3461772860
http://c.tieba.baidu.com/p/3461767442
http://c.tieba.baidu.com/p/3461736231
http://c.tieba.baidu.com/p/3461704953
http://c.tieba.baidu.com/p/3461692676
http://c.tieba.baidu.com/p/3461665341
http://c.tieba.baidu.com/p/3461656389
http://c.tieba.baidu.com/p/3461660595
http://c.tieba.baidu.com/p/3461566608
http://c.tieba.baidu.com/p/3461652243
http://c.tieba.baidu.com/p/3461561596
http://c.tieba.baidu.com/p/3461557067
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。