无法判断目标网站编码的解决方法
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded as text.
/// </summary>
/// <remarks>
/// The encoding is first taken from the charset reported by the HTTP response. If the decoded
/// markup contains a &lt;meta&gt; charset declaration that names a different, recognised
/// encoding, the buffered bytes are decoded a second time with that encoding, which wins.
/// Unrecognised charset names are ignored instead of aborting the download.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <param name="encode">
/// Receives the encoding that was selected to decode the returned text.
/// The incoming value is not read.
/// </param>
/// <returns>The decoded page source.</returns>
public static string GetDataFromUrl(string url, ref Encoding encode)
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

    // HTTP request settings.
    request.AllowAutoRedirect = true;
    request.AllowWriteStreamBuffering = true;
    request.Referer = "";
    request.Timeout = 10 * 1000;
    request.UserAgent = "";

    string characterSet;
    byte[] content;

    // The using blocks release the connection and both streams even when a read fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        characterSet = response.CharacterSet;

        // Buffer the whole body so it can be decoded more than once.
        using (Stream receiveStream = response.GetResponseStream())
        using (MemoryStream buffered = new MemoryStream())
        {
            byte[] chunk = new byte[4096];
            int count;
            while ((count = receiveStream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffered.Write(chunk, 0, count);
            }

            content = buffered.ToArray();
        }
    }

    // First guess: the charset announced by the HTTP response.
    if (string.IsNullOrEmpty(characterSet))
    {
        encode = Encoding.Default;
    }
    else
    {
        // NOTE(review): ISO-8859-1 is treated as "gb2312"; presumably because it is what the
        // response reports when the server sent no explicit charset and the target pages are
        // expected to be Chinese - confirm this mapping is still wanted.
        if (string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
        {
            characterSet = "gb2312";
        }

        Encoding headerEncoding;
        encode = TryGetPageEncoding(characterSet, out headerEncoding)
            ? headerEncoding
            : Encoding.Default;
    }

    string text = DecodePageBytes(content, encode);

    // Second guess: a charset declared inside the page itself. When it differs from the
    // charset used above, the page's own declaration takes precedence and the bytes are
    // decoded again.
    string metaCharSet = FindMetaCharset(text);
    if (metaCharSet != null
        && !string.Equals(metaCharSet, characterSet, StringComparison.OrdinalIgnoreCase))
    {
        Encoding metaEncoding;
        if (TryGetPageEncoding(metaCharSet, out metaEncoding))
        {
            encode = metaEncoding;
            text = DecodePageBytes(content, encode);
        }
    }

    return text;
}

/// <summary>
/// Matches the charset value of the first &lt;meta&gt; tag that declares one. Handles both
/// <c>&lt;meta charset="x"&gt;</c> and <c>content="text/html; charset=x"</c>, with double,
/// single or no quotes, and never looks past the end of the tag.
/// </summary>
private static readonly Regex MetaCharsetPattern = new Regex(
    @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*([A-Za-z0-9][\w.:\-]*)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

/// <summary>Returns the charset name declared by a &lt;meta&gt; tag in <paramref name="html"/>, or null if there is none.</summary>
private static string FindMetaCharset(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    Match match = MetaCharsetPattern.Match(html);
    return match.Success ? match.Groups[1].Value : null;
}

/// <summary>Resolves a charset name to an <see cref="Encoding"/>; returns false when the name is empty or not recognised.</summary>
private static bool TryGetPageEncoding(string name, out Encoding encoding)
{
    encoding = null;
    if (string.IsNullOrEmpty(name))
    {
        return false;
    }

    try
    {
        encoding = Encoding.GetEncoding(name.Trim());
        return true;
    }
    catch (ArgumentException)
    {
        // Unknown or malformed charset name.
        return false;
    }
    catch (NotSupportedException)
    {
        // Known name, but the encoding is not available on this platform.
        return false;
    }
}

/// <summary>Decodes the buffered response bytes into a string using <paramref name="encoding"/>.</summary>
private static string DecodePageBytes(byte[] content, Encoding encoding)
{
    // Same StreamReader(Stream, Encoding) constructor as before, so the reader's handling of
    // the bytes is unchanged; ReadToEnd replaces the repeated string concatenation.
    using (MemoryStream stream = new MemoryStream(content, false))
    using (StreamReader reader = new StreamReader(stream, encoding))
    {
        return reader.ReadToEnd();
    }
}
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。