如何遍历一个网站的所有页面
using System;
using System.Collections;
using System.Collections.Generic;
namespace WebSiteIterate
{
/// <summary>
/// Crawls a web site starting from a single URL, following plain-text
/// <c>&lt;a href="..."&gt;</c> links that stay on the start URL's host (or one
/// of its sub-domains). Pages are processed in first-in, first-out order and
/// every processed URL is recorded in <see cref="K_V_Already"/>.
/// </summary>
/// <remarks>
/// The whole crawl runs synchronously from the constructor. Pages are fetched
/// through the proxies returned by the remote <c>Proxy.Proxy</c> object; if
/// that list is empty no page is fetched at all.
/// </remarks>
class WebSiteIterate
{
    // Patterns are applied after all line breaks have been stripped from the
    // page, so '.' not matching '\n' is irrelevant here.
    private static readonly System.Text.RegularExpressions.Regex HeadBlock =
        new System.Text.RegularExpressions.Regex(@"<head[^>]*?>.*?</head>");

    private static readonly System.Text.RegularExpressions.Regex ScriptBlock =
        new System.Text.RegularExpressions.Regex(@"<script[^>]*?>.*?</script>");

    // This expression only captures anchors whose content is plain text;
    // an anchor that wraps an image (or any other tag) is not matched.
    private static readonly System.Text.RegularExpressions.Regex AnchorTag =
        new System.Text.RegularExpressions.Regex("<a\\s?[^>]*?\\s?href=\"([^\"]+)\"[^>]*>([^<]+)</a>");

    // An href containing any of these fragments is never followed.
    private static readonly string[] SkippedLinkFragments =
    {
        "script", "#", ".rar", ".doc", ".pdf", "mailto", ".xls", "{", "\""
    };

    private readonly string Url;
    private readonly string Domain;

    // URLs that have been processed, in processing order.
    private readonly List<string> K_V_Already = new List<string>();

    // Every URL ever queued (processed or still pending); gives O(1) duplicate checks.
    private readonly HashSet<string> K_V_Seen = new HashSet<string>();

    /// <summary>
    /// Stores the start URL, derives the site host from it and immediately
    /// runs the crawl.
    /// </summary>
    /// <param name="Url">Absolute URL of the page to start from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="Url"/> is null.</exception>
    /// <exception cref="UriFormatException"><paramref name="Url"/> is not a valid absolute URI.</exception>
    public WebSiteIterate(string Url)
    {
        this.Url = Url;
        Uri startUri = new System.Uri(this.Url);
        Domain = startUri.Host;
        Handle();
    }

    /// <summary>
    /// Runs the crawl: takes URLs from the front of the pending queue, fetches
    /// each one, queues every not-yet-seen same-site link it contains, and
    /// records the URL as processed.
    /// </summary>
    private void Handle()
    {
        Queue<string> pending = new Queue<string>();
        pending.Enqueue(this.Url);
        K_V_Seen.Add(this.Url);

        Proxy.Proxy proxy = (Proxy.Proxy)Activator.GetObject(typeof(Proxy.Proxy), "ipc://Proxy/Proxy");
        proxy.ProxyListGet();
        List<string> proxyList = proxy.ProxyList();

        while (pending.Count > 0)
        {
            string current = pending.Dequeue();

            foreach (Uri link in FetchLinks(current, proxyList))
            {
                // HashSet.Add returns false for a URL that was already queued,
                // so each URL enters the queue at most once.
                if (IsSameSite(link) && K_V_Seen.Add(link.ToString()))
                {
                    pending.Enqueue(link.ToString());
                }
            }

            K_V_Already.Add(current);
        }
    }

    /// <summary>
    /// Downloads <paramref name="pageUrl"/> through each proxy in turn until
    /// one attempt yields at least one link, and returns those links as
    /// absolute URIs. Returns an empty list when every attempt fails or finds
    /// no links.
    /// </summary>
    private static List<Uri> FetchLinks(string pageUrl, List<string> proxyList)
    {
        List<Uri> links = new List<Uri>();

        Uri baseUri;
        if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri))
        {
            // Without a valid base, no href on the page could be resolved.
            return links;
        }

        for (int i = 0; links.Count == 0 && i < proxyList.Count; i++)
        {
            string html;
            try
            {
                Extraction.Http.HttpResponseMgr response = new Extraction.Http.HttpResponseMgr(pageUrl, proxyList[i]);
                html = response.GetResult;
            }
            catch (Exception ex)
            {
                // Best effort: a failing proxy must not stop the crawl, but the
                // failure is reported instead of being silently discarded.
                System.Diagnostics.Trace.TraceWarning(
                    "Fetching {0} through proxy {1} failed: {2}", pageUrl, proxyList[i], ex.Message);
                continue;
            }

            ExtractLinks(html, baseUri, links);
        }

        return links;
    }

    /// <summary>
    /// Normalises <paramref name="html"/>, removes its head and script
    /// sections, and appends every followable anchor target, resolved against
    /// <paramref name="baseUri"/>, to <paramref name="links"/>.
    /// </summary>
    private static void ExtractLinks(string html, Uri baseUri, List<Uri> links)
    {
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        string str = html;
        str = str.Replace("\r\n", "");
        str = str.Replace("\r", "");
        str = str.Replace("\n", "");
        str = str.Replace("‘", "\"");
        // NOTE(review): the next two replacements look identical here; one may
        // originally have targeted a different whitespace character - confirm.
        str = str.Replace(" ", "");
        str = str.Replace(" ", "");
        str = HeadBlock.Replace(str, "");
        str = ScriptBlock.Replace(str, "");

        foreach (System.Text.RegularExpressions.Match match in AnchorTag.Matches(str))
        {
            string href = match.Groups[1].Value;
            if (IsSkippedLink(href))
            {
                continue;
            }

            // TryCreate instead of the throwing constructor: one malformed href
            // must not discard the remaining links of the page.
            Uri absolute;
            if (Uri.TryCreate(baseUri, href, out absolute))
            {
                links.Add(absolute);
            }
        }
    }

    /// <summary>
    /// Returns true when <paramref name="href"/> contains any fragment listed
    /// in <see cref="SkippedLinkFragments"/>.
    /// </summary>
    private static bool IsSkippedLink(string href)
    {
        foreach (string fragment in SkippedLinkFragments)
        {
            if (href.Contains(fragment))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Returns true when the host of <paramref name="link"/> is the start
    /// host or one of its sub-domains. Comparing hosts (rather than searching
    /// the whole URL for the domain text) keeps out foreign URLs that merely
    /// mention the domain in their path or query string.
    /// </summary>
    private bool IsSameSite(Uri link)
    {
        string host = link.Host;
        if (string.Equals(host, Domain, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        return host.EndsWith("." + Domain, StringComparison.OrdinalIgnoreCase);
    }
}
}
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。