如何遍历一个网站的所有页面

浏览数：61 / 时间：2015年06月09日

using System;
using System.Collections;
using System.Collections.Generic;

namespace WebSiteIterate
{
    /// <summary>
    /// Walks the pages of a web site starting from a single URL. Each page is downloaded
    /// through one of the proxies supplied by the remote <c>Proxy.Proxy</c> service, its
    /// plain-text anchors are extracted, and every not-yet-seen link that belongs to the
    /// start URL's host (or one of its sub-domains) is queued for a later visit.
    /// </summary>
    /// <remarks>
    /// The whole crawl runs synchronously inside the constructor. Visited URLs are
    /// accumulated in <c>K_V_Already</c> in the order they were processed.
    /// </remarks>
    class WebSiteIterate
    {
        // Built once: these patterns are loop-invariant. IgnoreCase is used because HTML
        // tag and attribute names are case-insensitive.
        private static readonly System.Text.RegularExpressions.Regex HeadSection =
            new System.Text.RegularExpressions.Regex(
                @"<head[^>]*?>.*?</head>",
                System.Text.RegularExpressions.RegexOptions.IgnoreCase);

        private static readonly System.Text.RegularExpressions.Regex ScriptSection =
            new System.Text.RegularExpressions.Regex(
                @"<script[^>]*?>.*?</script>",
                System.Text.RegularExpressions.RegexOptions.IgnoreCase);

        // This expression only captures anchors whose content is plain text; an anchor
        // that wraps an image (or any other nested tag) is not matched.
        private static readonly System.Text.RegularExpressions.Regex TextAnchor =
            new System.Text.RegularExpressions.Regex(
                "<a\\s?[^>]*?\\s?href=\"([^\"]+)\"[^>]*>([^<]+)</a>",
                System.Text.RegularExpressions.RegexOptions.IgnoreCase);

        private readonly string Url;
        private readonly string Domain;

        // URLs that have already been processed, in processing order.
        private readonly List<string> K_V_Already = new List<string>();

        /// <summary>
        /// Records the start URL and its host, then immediately crawls the site.
        /// </summary>
        /// <param name="Url">Absolute URL of the page the crawl starts from.</param>
        /// <exception cref="ArgumentNullException"><paramref name="Url"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="Url"/> is not a valid absolute URI.</exception>
        public WebSiteIterate(string Url)
        {
            this.Url = Url;
            Uri startUri = new System.Uri(this.Url);
            Domain = startUri.Host;
            Handle();
        }

        /// <summary>
        /// Processes queued URLs in first-in, first-out order until the queue is empty.
        /// </summary>
        private void Handle()
        {
            Queue<string> pending = new Queue<string>();
            // Every URL that is queued or already visited; gives constant-time duplicate checks.
            HashSet<string> known = new HashSet<string>();
            pending.Enqueue(this.Url);
            known.Add(this.Url);

            Proxy.Proxy proxyService = (Proxy.Proxy)Activator.GetObject(typeof(Proxy.Proxy), "ipc://Proxy/Proxy");
            proxyService.ProxyListGet();
            List<string> proxyList = proxyService.ProxyList();

            while (pending.Count > 0)
            {
                // Peek (not Dequeue) so the page stays "known" while its own links are examined.
                string currentUrl = pending.Peek();

                foreach (Uri candidate in FetchLinks(currentUrl, proxyList))
                {
                    // Check the host first so off-site URLs are never stored in the known set.
                    if (IsSameSite(candidate) && known.Add(candidate.ToString()))
                    {
                        pending.Enqueue(candidate.ToString());
                    }
                }

                K_V_Already.Add(pending.Dequeue());
            }
        }

        /// <summary>
        /// Downloads <paramref name="pageUrl"/> and returns the absolute links found on it.
        /// Proxies are tried in list order until one attempt yields at least one link or the
        /// list is exhausted; a failed attempt is logged and the next proxy is tried.
        /// </summary>
        /// <param name="pageUrl">Absolute URL of the page to download.</param>
        /// <param name="proxyList">Proxies to try, in order. An empty list means no download is attempted.</param>
        /// <returns>The links found; empty when every attempt failed or produced no link.</returns>
        private static List<Uri> FetchLinks(string pageUrl, List<string> proxyList)
        {
            List<Uri> links = new List<Uri>();

            Uri baseUri;
            if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri))
            {
                // Without a usable base no relative link can be resolved, so skip the download.
                return links;
            }

            for (int attempt = 0; links.Count == 0 && attempt < proxyList.Count; attempt++)
            {
                try
                {
                    Extraction.Http.HttpResponseMgr response = new Extraction.Http.HttpResponseMgr(pageUrl, proxyList[attempt]);
                    string html = response.GetResult;
                    if (string.IsNullOrEmpty(html))
                    {
                        // Nothing to parse; treat as a failed attempt and move to the next proxy.
                        continue;
                    }

                    ExtractLinks(html, baseUri, links);
                }
                catch (Exception e)
                {
                    // Best effort: a failing proxy must not abort the crawl, but the failure
                    // is reported instead of being silently discarded.
                    System.Diagnostics.Trace.TraceWarning(
                        "Fetching {0} failed on proxy attempt {1}: {2}", pageUrl, attempt, e.Message);
                }
            }

            return links;
        }

        /// <summary>
        /// Normalizes the markup, removes the head and script sections, and appends every
        /// acceptable anchor target (resolved against <paramref name="baseUri"/>) to
        /// <paramref name="links"/>.
        /// </summary>
        private static void ExtractLinks(string html, Uri baseUri, List<Uri> links)
        {
            string str = html;
            // Line breaks are removed so that '.' in the patterns can span what used to be several lines.
            str = str.Replace("\r\n", "");
            str = str.Replace("\r", "");
            str = str.Replace("\n", "");
            str = str.Replace("‘", "\"");
            // NOTE(review): the next two replacements look identical here; the second may
            // originally have targeted a different whitespace character (tab / NBSP) - confirm.
            str = str.Replace(" ", "");
            str = str.Replace(" ", "");
            str = HeadSection.Replace(str, "");
            str = ScriptSection.Replace(str, "");

            foreach (System.Text.RegularExpressions.Match anchor in TextAnchor.Matches(str))
            {
                string link = anchor.Groups[1].Value;
                if (!IsCrawlableLink(link))
                {
                    continue;
                }

                // TryCreate instead of the throwing constructor: one malformed href must not
                // discard the remaining links of the page.
                Uri absoluteUri;
                if (Uri.TryCreate(baseUri, link, out absoluteUri))
                {
                    links.Add(absoluteUri);
                }
            }
        }

        /// <summary>
        /// Returns false for href values containing any of the markers the crawler skips
        /// (scripts, fragments, mail links, selected document/archive extensions, template
        /// braces, stray quotes).
        /// </summary>
        private static bool IsCrawlableLink(string link)
        {
            return !(link.Contains("script") || link.Contains("#") || link.Contains(".rar")
                || link.Contains(".doc") || link.Contains(".pdf") || link.Contains("mailto")
                || link.Contains(".xls") || link.Contains("{") || link.Contains("\""));
        }

        /// <summary>
        /// True when the candidate's host is the start host or one of its sub-domains.
        /// Comparing hosts (rather than searching the whole URL text) keeps out foreign
        /// sites that merely mention the domain in their path or query string.
        /// </summary>
        private bool IsSameSite(Uri candidate)
        {
            string host = candidate.Host;
            return host.Equals(Domain, StringComparison.OrdinalIgnoreCase)
                || host.EndsWith("." + Domain, StringComparison.OrdinalIgnoreCase);
        }
    }
}