下载网页信息
/// <summary>
/// Finds a character-set declaration in either an HTML <c>&lt;meta&gt;</c> tag
/// (both <c>charset=...</c> and <c>http-equiv ... content="...; charset=..."</c> forms)
/// or an XML declaration (<c>encoding="..."</c>). An optional opening quote
/// (single or double) is skipped so the named group holds only the charset name.
/// </summary>
private static readonly Regex CharsetRegex = new Regex(
    "<meta[^>]*?charset\\s*=\\s*[\"']?(?<charset>[^>'\"\\s;]+)" +
    "|xml[^>]+?encoding\\s*=\\s*[\"']?(?<charset>[^>'\"\\s]+)",
    RegexOptions.IgnoreCase);

/// <summary>
/// Downloads a web page and decodes it using the character encoding the page
/// itself declares.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // URL of the page whose content is fetched.
    String urlDownLoad = "http://www.cnblogs.com";

    // The URL uses the HTTP scheme, so WebRequest.Create returns an HttpWebRequest.
    HttpWebRequest request = (HttpWebRequest) System.Net.WebRequest.Create(urlDownLoad);

    // 'using' guarantees the response (and its connection) is released even if
    // reading or decoding throws.
    using (HttpWebResponse response = (HttpWebResponse) request.GetResponse())
    {
        byte[] buffer = GetBytes(response);

        // First pass: decode as ASCII only to locate the charset declaration;
        // the declaration itself consists of ASCII characters.
        string strResult = Encoding.ASCII.GetString(buffer);
        string encodingName = DetectCharsetName(strResult);

        // Second pass: decode the same bytes with the encoding the page declares.
        // NOTE(review): the decoded HTML is not consumed anywhere in the visible code.
        string strHtml = GetEncodingByName(encodingName).GetString(buffer);
    }
}

/// <summary>
/// Extracts the charset name declared in an HTML meta tag or XML declaration.
/// </summary>
/// <param name="text">Page text in which to look for the declaration.</param>
/// <returns>The trimmed charset name, or an empty string when none is found.</returns>
private static string DetectCharsetName(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return "";
    }

    Match match = CharsetRegex.Match(text);
    return match.Success ? match.Groups["charset"].Value.Trim() : "";
}

/// <summary>
/// Resolves an encoding name to an <see cref="Encoding"/> instance.
/// </summary>
/// <param name="encodingName">Charset name such as "utf-8"; may be null or empty.</param>
/// <returns>
/// <see cref="Encoding.Default"/> when the name is null or empty;
/// <see cref="Encoding.UTF8"/> when the name is not recognised or not supported;
/// otherwise the matching encoding.
/// </returns>
private static Encoding GetEncodingByName(string encodingName)
{
    if (string.IsNullOrEmpty(encodingName))
    {
        return Encoding.Default;
    }

    try
    {
        return Encoding.GetEncoding(encodingName);
    }
    catch (ArgumentException)
    {
        // Unknown charset name: fall back to UTF-8 rather than failing the download.
        return Encoding.UTF8;
    }
    catch (NotSupportedException)
    {
        // Known name but the code page is unavailable on this platform.
        return Encoding.UTF8;
    }
}

/// <summary>
/// Reads the entire body of a response into a byte array.
/// </summary>
/// <param name="response">The response whose stream is read to the end.</param>
/// <returns>The body bytes; an empty array when the response has no stream.</returns>
private static byte[] GetBytes(WebResponse response)
{
    using (var memoryStream = new MemoryStream())
    {
        using (var rs = response.GetResponseStream())
        {
            if (rs != null)
            {
                var chunk = new byte[0x100];
                int read;

                // Read returns 0 at end of stream.
                while ((read = rs.Read(chunk, 0, chunk.Length)) > 0)
                {
                    memoryStream.Write(chunk, 0, read);
                }
            }
        }

        return memoryStream.ToArray();
    }
}
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。