正则抓取网页所有href和src

浏览数：171 / 时间：2015年06月09日

抓取页面内容后，用正则表达式匹配页面中所有的 href 和 src 属性。

// User-Agent header value sent with every outgoing fetch (see getVerificationCode).
string UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:29.0) Gecko/20100101 Firefox/29.0";
    // Content-Type header of the most recent successful fetch; written by getVerificationCode.
    // NOTE(review): instance state shared across requests — confirm this is safe if requests overlap.
    string ContentType = "";

    // Base URI of the upstream site; the incoming request's raw URL is resolved against it.
    Uri strReqUrl = new Uri("http://m.lhrb.ufstone.net/");
    /// <summary>
    /// Handles the start of every request: fetches the same path from the upstream
    /// site (<c>strReqUrl</c>), decodes the body as gb2312 text, runs the href/src
    /// scan over it and writes the text back to the client.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event arguments (unused).</param>
    protected void Application_BeginRequest(object sender, EventArgs e)
    {
        // Resolve the incoming raw URL against the upstream base address.
        Uri u = new Uri(strReqUrl, Request.RawUrl);
        byte[] b = getVerificationCode(u);

        // getVerificationCode returns null when the upstream fetch fails; decoding
        // null would throw ArgumentNullException, so report a gateway error instead.
        if (b == null)
        {
            Response.StatusCode = 502;
            Response.End();
            return;
        }

        // A disabled earlier variant streamed the raw bytes back using the captured
        // Content-Type (Response.BinaryWrite) instead of decoding them as text.
        // NOTE(review): the body is always decoded as gb2312, whatever the upstream
        // Content-Type says — confirm non-text resources never reach this handler.
        StringBuilder strHtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(b));
        GetHtmlUrl(ref strHtml);
        Response.Write(strHtml.ToString());
        Response.End();
    }
    /// <summary>
    /// Downloads the resource at <paramref name="url"/> with browser-like request
    /// headers and records the response's Content-Type in the <c>ContentType</c> field.
    /// </summary>
    /// <param name="url">Absolute address of the resource to download.</param>
    /// <returns>
    /// The raw response bytes, or <c>null</c> when the address is missing or not
    /// absolute, or when the download fails.
    /// </returns>
    public byte[] getVerificationCode(Uri url)
    {
        // Unusable addresses yield null, the same result callers already get for a failed download.
        if (url == null || !url.IsAbsoluteUri)
        {
            return null;
        }

        // WebClient is IDisposable; release it deterministically once the download is done.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            client.Headers.Add("Accept-Language", "    zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            client.Headers.Add("User-Agent", this.UserAgent);
            client.Credentials = CredentialCache.DefaultCredentials;
            try
            {
                byte[] pageData = client.DownloadData(url.AbsoluteUri);
                ContentType = client.ResponseHeaders["Content-Type"];
                return pageData;
            }
            catch (WebException)
            {
                // Network or HTTP-level failure: signal it to the caller with null.
                return null;
            }
        }
    }

View Code

    /// <summary>
    /// Scans the markup for <c>src</c>/<c>href</c> attributes and gathers every
    /// matched attribute, one per line, into a local buffer.
    /// </summary>
    /// <param name="strHtml">Markup to scan. It is read but never modified here.</param>
    /// <remarks>
    /// The gathered list is not returned or stored, so the method currently has no
    /// observable effect. A disabled earlier draft rewrote each matched URL relative
    /// to <c>strReqUrl</c> inside <paramref name="strHtml"/>.
    /// </remarks>
    void GetHtmlUrl(ref StringBuilder strHtml)
    {
        // Attribute name, optional whitespace around '=', then either a double-quoted
        // value or a run of non-whitespace characters; the value lands in group 1.
        // Matching is case-sensitive.
        const string attributePattern = "(src|href)\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))";

        string markup = strHtml.ToString();
        StringBuilder found = new StringBuilder();

        for (Match hit = Regex.Match(markup, attributePattern); hit.Success; hit = hit.NextMatch())
        {
            // Record the whole matched attribute (name and value), newline-terminated.
            found.Append(hit.Value).Append("\n");
        }
    }