利用htmlparser提取网页纯文本的例子

import org.htmlparser.Node;

import org.htmlparser.NodeFilter;

import org.htmlparser.Parser;

import org.htmlparser.filters.TagNameFilter;

import org.htmlparser.tags.TableTag;

import org.htmlparser.util.NodeList;

 

/**

 * 标题:利用htmlparser提取网页纯文本的例子

 */

public class TestHTMLParser2 {

       /**

        * 读取目标html内容

        *

        */

       publicstatic void testHtml() {

              try{

                     StringsCurrentLine;

                     StringsTotalString;

                     sCurrentLine= "";

                     sTotalString= "";

                     java.io.InputStreaml_urlStream;

                     java.net.URLl_url = new java.net.URL(

                                   "http://10.249.187.199:8083/injs100/");

                     java.net.HttpURLConnectionl_connection = (java.net.HttpURLConnection) l_url

                                   .openConnection();

                     l_connection.connect();

                     l_urlStream= l_connection.getInputStream();

                     java.io.BufferedReaderl_reader = new java.io.BufferedReader(

                                   newjava.io.InputStreamReader(l_urlStream));

                     while((sCurrentLine = l_reader.readLine()) != null) {

                            sTotalString+= sCurrentLine + "\r\n";

                     }

 

                     StringtestText = extractText(sTotalString);

              }catch (Exception e) {

                     e.printStackTrace();

              }

 

       }

   /**

    * 抽取纯文本信息

    * @param inputHtml:html文本

    * @return

    * @throws Exception

    */

       publicstatic String extractText(String inputHtml) throws Exception {

              StringBuffertext = new StringBuffer();

              Parserparser = Parser.createParser(new String(inputHtml.getBytes(),

                            "GBK"),"GBK");

              //遍历所有的节点

              NodeListnodes = parser.extractAllNodesThatMatch(new NodeFilter() {

                     publicboolean accept(Node node) {

                            returntrue;

                     }

              });

 

              System.out.println(nodes.size());

              for(int i = 0; i < nodes.size(); i++) {

                     Nodenodet = nodes.elementAt(i);

                     //字符串的代表性节点:节点的描述

                     text.append(newString(nodet.toPlainTextString().getBytes("GBK"))

                                   +"\r\n");

              }

              returntext.toString();

       }

   /**
    * Analyzes content read from a file or a URL; the resource may be a file
    * path or a URL.
    *
    * @param resource the file path or URL to parse
    * @throws Exception
    */

       publicstatic void test5(String resource) throws Exception {

              ParsermyParser = new Parser(resource);

              myParser.setEncoding("GBK");

              StringfilterStr = "table";

              NodeFilterfilter = new TagNameFilter(filterStr);

              NodeListnodeList = myParser.extractAllNodesThatMatch(filter);

              /*for(inti=0;i<nodeList.size();i++)

              {

                     TableTagtabletag = (TableTag) nodeList.elementAt(i);

                     //标签名称

                     System.out.println(tabletag.getTagName());

                     System.out.println(tabletag.getText());

              }*/

              TableTagtabletag = (TableTag) nodeList.elementAt(1);

              

http://c.tieba.baidu.com/p/3392402431
http://c.tieba.baidu.com/p/3392405849
http://c.tieba.baidu.com/p/3392411579
http://c.tieba.baidu.com/p/3392416082
http://c.tieba.baidu.com/p/3392420343
http://c.tieba.baidu.com/p/3392424840
http://c.tieba.baidu.com/p/3392429238
http://c.tieba.baidu.com/p/3392433592
http://c.tieba.baidu.com/p/3392437923
http://c.tieba.baidu.com/p/3392442352
http://c.tieba.baidu.com/p/3392446861
http://c.tieba.baidu.com/p/3392451443
http://c.tieba.baidu.com/p/3392455917
http://c.tieba.baidu.com/p/3392460228
http://c.tieba.baidu.com/p/3392464693
http://c.tieba.baidu.com/p/3392469164
http://c.tieba.baidu.com/p/3392473582
http://c.tieba.baidu.com/p/3392478006
http://c.tieba.baidu.com/p/3392482459
http://c.tieba.baidu.com/p/3392487114
http://c.tieba.baidu.com/p/3392491670
http://c.tieba.baidu.com/p/3392496190
http://c.tieba.baidu.com/p/3392500743
http://c.tieba.baidu.com/p/3392505309
http://c.tieba.baidu.com/p/3392509820
http://c.tieba.baidu.com/p/3392514415
http://c.tieba.baidu.com/p/3392519080
http://c.tieba.baidu.com/p/3392523563
http://c.tieba.baidu.com/p/3392528180
http://c.tieba.baidu.com/p/3392532879
http://c.tieba.baidu.com/p/3392537592
http://c.tieba.baidu.com/p/3392542235
http://c.tieba.baidu.com/p/3392546757
http://c.tieba.baidu.com/p/3392551549
http://c.tieba.baidu.com/p/3392556372
http://c.tieba.baidu.com/p/3392561202
http://c.tieba.baidu.com/p/3392566194
http://c.tieba.baidu.com/p/3392571173
http://c.tieba.baidu.com/p/3392576221
http://c.tieba.baidu.com/p/3392581132
http://c.tieba.baidu.com/p/3392586257
http://c.tieba.baidu.com/p/3392591293
http://c.tieba.baidu.com/p/3392596453
http://c.tieba.baidu.com/p/3392601756
http://c.tieba.baidu.com/p/3392606977
http://c.tieba.baidu.com/p/3392612191
http://c.tieba.baidu.com/p/3392617313
http://c.tieba.baidu.com/p/3392622531
http://c.tieba.baidu.com/p/3392627719
http://c.tieba.baidu.com/p/3392633007
http://c.tieba.baidu.com/p/3392638423
http://c.tieba.baidu.com/p/3392643790
http://c.tieba.baidu.com/p/3392649198
http://c.tieba.baidu.com/p/3392654429
http://c.tieba.baidu.com/p/3392659821
http://c.tieba.baidu.com/p/3392665260
http://c.tieba.baidu.com/p/3392670593
http://c.tieba.baidu.com/p/3392676058
http://c.tieba.baidu.com/p/3392681592
http://c.tieba.baidu.com/p/3392687221
http://c.tieba.baidu.com/p/3392692701
http://c.tieba.baidu.com/p/3392698484
http://c.tieba.baidu.com/p/3392704140
http://c.tieba.baidu.com/p/3392783374
http://c.tieba.baidu.com/p/3392798951
http://c.tieba.baidu.com/p/3392817954
http://c.tieba.baidu.com/p/3392817954
http://c.tieba.baidu.com/p/3392841279
http://c.tieba.baidu.com/p/3392850395
http://c.tieba.baidu.com/p/3392856094
http://c.tieba.baidu.com/p/3392861130
http://c.tieba.baidu.com/p/3392870496
http://c.tieba.baidu.com/p/3392903790
http://c.tieba.baidu.com/p/3392910784
http://c.tieba.baidu.com/p/3392915350

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。