1. 使用HtmlParser取得一段html代码里面所有的链接地址和链接名称

package parser;

 

import org.htmlparser.Parser;

import org.htmlparser.Node;

import org.htmlparser.NodeFilter;

import org.htmlparser.Parser;

import org.htmlparser.filters.TagNameFilter;

import org.htmlparser.tags.LinkTag;

import org.htmlparser.tags.TableTag;

import org.htmlparser.util.NodeList;

import org.htmlparser.util.ParserException;

import org.htmlparser.visitors.HtmlPage;

 

/**

 *htmlparser取得一段html代码里面所有的链接地址和链接名称

 *

 *@author chenguoyong

 *

 */

public class Testhtmlparser {

 

       /**

        * @param args

        */

       publicstatic void main(String[] args) {

              Stringhtmlcode ="<HTML><HEAD><TITLE>AAA</TITLE></HEAD><BODY>"

                            +"<a href=‘http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html‘>连接1</a>"

                            +"<a href=‘http://topic.csdn.net‘>连接2</a></BODY></HTML>";

              //创建Parser对象根据传给字符串和指定的编码

              Parserparser = Parser.createParser(htmlcode, "GBK");

              //创建HtmlPage对象HtmlPage(Parser parser)

              HtmlPagepage = new HtmlPage(parser);

              try{

                     //HtmlPage extends visitor,Apply the given visitor to the current

                     //page.

                     parser.visitAllNodesWith(page);

              }catch (ParserException e1) {

                     e1= null;

              }

              //所有的节点

              NodeListnodelist = page.getBody();

              //建立一个节点filter用于过滤节点

              NodeFilterfilter = new TagNameFilter("A");

              //得到所有过滤后,想要的节点

              nodelist= nodelist.extractAllNodesThatMatch(filter, true);

              for(int i = 0; i < nodelist.size(); i++) {

                     LinkTaglink = (LinkTag) nodelist.elementAt(i);

                     //链接地址

                     System.out.println(link.getAttribute("href")+ "\n");

                     //链接名称

                     System.out.println(link.getStringText());

              }

 

       }

 

}

 

结果如下:

http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html

连接1

http://topic.csdn.net

连接2

 

2. 使用HtmlParser抓取网页内容

package parser;

 

import org.htmlparser.Parser;

import org.htmlparser.beans.StringBean;

import org.htmlparser.filters.NodeClassFilter;

import org.htmlparser.parserapplications.StringExtractor;

import org.htmlparser.tags.BodyTag;

import org.htmlparser.util.NodeList;

import org.htmlparser.util.ParserException;

 

/**

 * 使用HtmlParser抓取网页内容: 要抓取页面的内容最方便的方法就是使用StringBean, 里面有几个控制页面内容的参数.

 * 在后面的代码中会有说明. Htmlparser包中还有一个示例StringExtractor 里面有个直接得到内容的方法,

 * 其中也是使用了StringBean . 另外直接解析Parser的每个标签也可以的.

 *

 *@author chenguoyong

 *

 */

public class GetContent {

       publicvoid getContentUsingStringBean(String url) {

              StringBeansb = new StringBean();

              sb.setLinks(true);// 是否显示web页面的连接(Links)

              //为了取得页面的整洁美观一般设置上面两项为true , 如果要保持页面的原有格式, 如代码页面的空格缩进 可以设置为false

              sb.setCollapse(true);// 如果是true的话把一系列空白字符用一个字符替代.

              sb.setReplaceNonBreakingSpaces(true);//If true regular space

              sb

                 

       }

 

   

              StringExtractorse = new StringExtractor(url);

              Stringtext = null;

              try{

                     text= se.extractStrings(link);

                     System.out.println("Thecontent is :\n" + text);

              }catch (ParserException e) {

http://c.tieba.baidu.com/p/3392933442
http://c.tieba.baidu.com/p/3392940474
http://c.tieba.baidu.com/p/3392948214
http://c.tieba.baidu.com/p/3392954889
http://c.tieba.baidu.com/p/3392961801
http://c.tieba.baidu.com/p/3392968280
http://c.tieba.baidu.com/p/3392974353
http://c.tieba.baidu.com/p/3392980509
http://c.tieba.baidu.com/p/3392986542
http://c.tieba.baidu.com/p/3392992535
http://c.tieba.baidu.com/p/3392998666
http://c.tieba.baidu.com/p/3393004740
http://c.tieba.baidu.com/p/3393010920
http://c.tieba.baidu.com/p/3393016880
http://c.tieba.baidu.com/p/3393021482
http://c.tieba.baidu.com/p/3393017531
http://c.tieba.baidu.com/p/3393011388
http://c.tieba.baidu.com/p/3393027106
http://c.tieba.baidu.com/p/3393053976
http://c.tieba.baidu.com/p/3393027106
http://c.tieba.baidu.com/p/3393060946
http://c.tieba.baidu.com/p/3393068852
http://c.tieba.baidu.com/p/3393060946
http://c.tieba.baidu.com/p/3393074635
http://c.tieba.baidu.com/p/3393075936
http://c.tieba.baidu.com/p/3393053976
http://c.tieba.baidu.com/p/3393082662
http://c.tieba.baidu.com/p/3393088289
http://c.tieba.baidu.com/p/3393093781
http://c.tieba.baidu.com/p/3393099259
http://c.tieba.baidu.com/p/3393104841
http://c.tieba.baidu.com/p/3393110118
http://c.tieba.baidu.com/p/3393115345
http://c.tieba.baidu.com/p/3393120475
http://c.tieba.baidu.com/p/3393125593
http://c.tieba.baidu.com/p/3393130563
http://c.tieba.baidu.com/p/3393135504
http://c.tieba.baidu.com/p/3393078933
http://c.tieba.baidu.com/p/3393075936
http://c.tieba.baidu.com/p/3393091871
http://c.tieba.baidu.com/p/3393093781
http://c.tieba.baidu.com/p/3393098093
http://c.tieba.baidu.com/p/3393098093
http://c.tieba.baidu.com/p/3393102566
http://c.tieba.baidu.com/p/3393085191
http://c.tieba.baidu.com/p/3393108962
http://c.tieba.baidu.com/p/3393113979
http://c.tieba.baidu.com/p/3393118658
http://c.tieba.baidu.com/p/3393124867
http://c.tieba.baidu.com/p/3393129650
http://c.tieba.baidu.com/p/3393135103
http://c.tieba.baidu.com/p/3393155246

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。