使用HtmlParser抓取网页内容

package parser;

 

import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

 

/**

 * 使用HtmlParser抓去网页内容: 要抓去页面的内容最方便的方法就是使用StringBean. 里面有几个控制页面内容的几个参数.

 * 在后面的代码中会有说明. Htmlparser包中还有一个示例StringExtractor 里面有个直接得到内容的方法,

 * 其中也是使用了StringBean . 另外直接解析Parser的每个标签也可以的.

 *

 *@author chenguoyong

 *

 */

public class GetContent {

       publicvoid getContentUsingStringBean(String url) {

              StringBeansb = new StringBean();

              sb.setLinks(true);// 是否显示web页面的连接(Links)

              //为了取得页面的整洁美观一般设置上面两项为true , 如果要保持页面的原有格式, 如代码页面的空格缩进 可以设置为false

              sb.setCollapse(true);// 如果是true的话把一系列空白字符用一个字符替代.

              sb.setReplaceNonBreakingSpaces(true);//If true regular space

              sb

                            .setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");

              System.out.println("TheContent is :\n" + sb.getStrings());

 

       }

 

       publicvoid getContentUsingStringExtractor(String url, boolean link) {

              //StringExtractor内部机制和上面的一样.做了一下包装

              StringExtractorse = new StringExtractor(url);

              Stringtext = null;

              try{

                     text= se.extractStrings(link);

                     System.out.println("Thecontent is :\n" + text);

              }catch (ParserException e) {

                     e.printStackTrace();

              }

       }

 

       publicvoid getContentUsingParser(String url) {

              NodeListnl;

              try{

                     Parserp = new Parser(url);

                     nl= p.parse(new NodeClassFilter(BodyTag.class));

                     BodyTagbt = (BodyTag) nl.elementAt(0);

                     System.out.println(bt.toPlainTextString());// 保留原来的内容格式. 包含js代码

              }catch (ParserException e) {

                     e.printStackTrace();

              }

       }

 

       /**

        * @param args

        */

       publicstatic void main(String[] args) {

              Stringurl = "http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html";

              //newGetContent().getContentUsingParser(url);

              //--------------------------------------------------

              newGetContent().getContentUsingStringBean(url);

 

       }

 http://c.tieba.baidu.com/p/3476776824
http://c.tieba.baidu.com/p/3476808306
http://c.tieba.baidu.com/p/3476798710
http://c.tieba.baidu.com/p/3474281354
http://c.tieba.baidu.com/p/3474300101
http://c.tieba.baidu.com/p/3474294075
http://c.tieba.baidu.com/p/3474123295
http://c.tieba.baidu.com/p/3474314242
http://c.tieba.baidu.com/p/3474310411
http://c.tieba.baidu.com/p/3474304550
http://c.tieba.baidu.com/p/3475433945
http://c.tieba.baidu.com/p/3475430015
http://c.tieba.baidu.com/p/3475433348
http://c.tieba.baidu.com/p/3475431434
http://c.tieba.baidu.com/p/3474176863
http://c.tieba.baidu.com/p/3474159835
http://c.tieba.baidu.com/p/3474163941
http://c.tieba.baidu.com/p/3474156121
http://c.tieba.baidu.com/p/3474147660
http://c.tieba.baidu.com/p/3474151899
http://c.tieba.baidu.com/p/3474142287
http://c.tieba.baidu.com/p/3474136965
http://c.tieba.baidu.com/p/3474133165
http://c.tieba.baidu.com/p/3474128675
http://c.tieba.baidu.com/p/3474103896
http://c.tieba.baidu.com/p/3474099488
http://c.tieba.baidu.com/p/3474094120
http://c.tieba.baidu.com/p/3475431976
http://c.tieba.baidu.com/p/3474267991
http://c.tieba.baidu.com/p/3474259583
http://c.tieba.baidu.com/p/3474254990
http://c.tieba.baidu.com/p/3474228986
http://c.tieba.baidu.com/p/3474221626
http://c.tieba.baidu.com/p/3474215742
http://c.tieba.baidu.com/p/3474212122
http://c.tieba.baidu.com/p/3474188883
http://c.tieba.baidu.com/p/3474207722
http://c.tieba.baidu.com/p/3474184143
http://c.tieba.baidu.com/p/3474180522
http://c.tieba.baidu.com/p/3474171022
http://c.tieba.baidu.com/p/3474086627
http://c.tieba.baidu.com/p/3462847203
http://c.tieba.baidu.com/p/3462839334
http://c.tieba.baidu.com/p/3462834294
http://c.tieba.baidu.com/p/3462786130
http://c.tieba.baidu.com/p/3462782768
http://c.tieba.baidu.com/p/3461791753
http://c.tieba.baidu.com/p/3461784215
http://c.tieba.baidu.com/p/3461778008
http://c.tieba.baidu.com/p/3461772860
http://c.tieba.baidu.com/p/3461767442
http://c.tieba.baidu.com/p/3461736231
http://c.tieba.baidu.com/p/3461704953
http://c.tieba.baidu.com/p/3461692676
http://c.tieba.baidu.com/p/3461665341
http://c.tieba.baidu.com/p/3461656389
http://c.tieba.baidu.com/p/3461660595
http://c.tieba.baidu.com/p/3461566608
http://c.tieba.baidu.com/p/3461652243
http://c.tieba.baidu.com/p/3461561596
http://c.tieba.baidu.com/p/3461557067



郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。