使用htmlparser取得一段html代码里面所有的链接地址和链接名称

package parser;

 

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

 

/**

 *htmlparser取得一段html代码里面所有的链接地址和链接名称

 *

 *@author chenguoyong

 *

 */

public class Testhtmlparser {

 

       /**

        * @param args

        */

       publicstatic void main(String[] args) {

              Stringhtmlcode ="<HTML><HEAD><TITLE>AAA</TITLE></HEAD><BODY>"

                            +"<a href=‘http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html‘>连接1</a>"

                            +"<a href=‘http://topic.csdn.net‘>连接2</a></BODY></HTML>";

              //创建Parser对象根据传给字符串和指定的编码

              Parserparser = Parser.createParser(htmlcode, "GBK");

              //创建HtmlPage对象HtmlPage(Parser parser)

              HtmlPagepage = new HtmlPage(parser);

              try{

http://weibo.com/p/1001603758223206303667
http://weibo.com/p/1001603758221000048944
http://weibo.com/p/1001603758220953976290
http://weibo.com/p/1001603758220907781815
http://weibo.com/p/1001603758220853311184
http://weibo.com/p/1001603758221939588132
http://weibo.com/p/1001603758221855711275
http://weibo.com/p/1001603758221805378773
http://weibo.com/p/1001603758221713102251
http://weibo.com/p/1001603758222199639242
http://weibo.com/p/1001603758222254166114
http://weibo.com/p/1001603758222312898733
http://weibo.com/p/1001603758222363220096
http://weibo.com/p/1001603758222669421859
http://weibo.com/p/1001603758222623283555
http://weibo.com/p/1001603758222715548378
http://weibo.com/p/1001603758222761698473
http://weibo.com/p/1001603758223265025161
http://weibo.com/p/1001603758223315345118
http://weibo.com/p/1001603758223365690667
http://weibo.com/p/1001603758223759963259
http://weibo.com/p/1001603758223806101541
http://weibo.com/p/1001603758223864809084
http://weibo.com/p/1001603758223915141620
http://weibo.com/p/1001603758224376472925
http://weibo.com/p/1001603758224334529025
http://weibo.com/p/1001603758224288441648
http://weibo.com/p/1001603758224229720238
http://weibo.com/p/1001603758224972060148
http://weibo.com/p/1001603758225014019919
http://weibo.com/p/1001603758225064352489
http://weibo.com/p/1001603758225165003044
http://weibo.com/p/1001603758225437662561
http://weibo.com/p/1001603758225483782220
http://weibo.com/p/1001603758225529920430
http://weibo.com/p/1001603758225576077443
http://weibo.com/p/1001603758225865470754
http://weibo.com/p/1001603758225911608998
http://weibo.com/p/1001603758225957766785
http://weibo.com/p/1001603758225999710651
http://weibo.com/p/1001603758226242965346
http://weibo.com/p/1001603758226284909190
http://weibo.com/p/1001603758226364622589
http://weibo.com/p/1001603758226414934928
http://weibo.com/p/1001603758227417348265
http://weibo.com/p/1001603758226754679860
http://weibo.com/p/1001603758226708541534
http://weibo.com/p/1001603758227627110656
http://weibo.com/p/1001603758227677443232
http://weibo.com/p/1001603758227731927087
http://weibo.com/p/1001603758227773870957
http://weibo.com/p/1001603758228637914405
http://weibo.com/p/1001603758228684028362
http://weibo.com/p/1001603758228767915996
http://weibo.com/p/1001603758228814054130
http://weibo.com/p/1001603758229304821435
http://weibo.com/p/1001603758229258683175
http://weibo.com/p/1001603758229212519842
http://weibo.com/p/1001603758229149604164

                     //HtmlPage extends visitor,Apply the given visitor to the current

                     //page.

                     parser.visitAllNodesWith(page);

              }catch (ParserException e1) {

                     e1= null;

              }

              //所有的节点

              NodeListnodelist = page.getBody();

              //建立一个节点filter用于过滤节点

              NodeFilterfilter = new TagNameFilter("A");

              //得到所有过滤后,想要的节点

              nodelist= nodelist.extractAllNodesThatMatch(filter, true);

              for(int i = 0; i < nodelist.size(); i++) {

                     LinkTaglink = (LinkTag) nodelist.elementAt(i);

                     //链接地址

                     System.out.println(link.getAttribute("href")+ "\n");

                     //链接名称

                     System.out.println(link.getStringText());

              }

 

       }

 

}

 

结果如下:

http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html

连接1

http://topic.csdn.net

连接2

 

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。