[致初学者]模拟Web请求——Get

浏览数：44 / 时间：2015年06月09日

在实际生活中，网络请求的应用极其常见：比如我们使用浏览器上网，在程序中也经常要调用 WebService。那么浏览器是怎么请求网络资源的呢？不用浏览器，我们可以自己发起请求吗？

答案是可以的。

如果我们可以用自己的程序主动发起网络请求，那么我们可以：模拟提交数据，做一些简单网页游戏的外挂，可以刷一些帖子的访问量，可以抓取网络上的资源……

废话不多说，本文以使用 Get 方式请求有道词典网站为核心，编写了一个简单的单词查询客户端。如果不太懂正则表达式，可以改用字符串 IndexOf 操作，或者自行对正则表达式做一些基本的了解和学习。

效果图：

代码：

using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace NetCapture
{
    /// <summary>
    /// Base class for scrapers that fetch a page with a GET request and extract
    /// data from it with a regular expression. Derived classes supply the pattern
    /// and decide how the resulting match is converted into key/value pairs, so
    /// the request/read plumbing is written only once.
    /// </summary>
    public abstract class GetScraperBase
    {
        /// <summary>
        /// Regular expression pattern applied to the downloaded page content.
        /// It is evaluated with <see cref="RegexOptions.IgnorePatternWhitespace"/>.
        /// </summary>
        protected abstract string Pattern { get; }

        /// <summary>
        /// Converts the regex match into the result dictionary.
        /// </summary>
        /// <param name="match">
        /// Result of matching <see cref="Pattern"/> against the page content;
        /// it may be an unsuccessful match.
        /// </param>
        /// <returns>The extracted values, keyed by name.</returns>
        protected abstract Dictionary<string, string> FilterMatch(Match match);

        /// <summary>
        /// Requests <paramref name="url"/>, reads the whole response body as text,
        /// matches it against <see cref="Pattern"/> and returns whatever
        /// <see cref="FilterMatch"/> extracts from that match.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="proxy">
        /// Optional proxy server to route the request through (useful in
        /// environments without direct internet access); <c>null</c> leaves the
        /// request's default proxy untouched.
        /// </param>
        /// <returns>The dictionary produced by <see cref="FilterMatch"/>.</returns>
        public Dictionary<string, string> Scrape(string url, WebProxy proxy = null)
        {
            var request = WebRequest.Create(url);
            if (proxy != null)
            {
                request.Proxy = proxy;
            }

            // The response, its stream and the reader all hold the underlying
            // connection; dispose them deterministically so repeated calls do not
            // leak connections while waiting for finalization.
            string content;
            using (var response = request.GetResponse())
            using (var stream = response.GetResponseStream())
            using (var responseReader = new StreamReader(stream))
            {
                content = responseReader.ReadToEnd();
            }

            var match = Regex.Match(content, Pattern, RegexOptions.IgnorePatternWhitespace);
            return FilterMatch(match);
        }
    }

    /// <summary>
    /// Scraper for the Youdao online dictionary: looks a word up with a GET
    /// request and extracts the list of translations from the result page.
    /// </summary>
    public class YouDaoScaper : GetScraperBase
    {
        /// <summary>
        /// Pattern that locates the translation list in the result page.
        /// </summary>
        protected override string Pattern
        {
            get
            {
                /* Target fragment in the response:
                         <div class="trans-container">
                         <ul>
                         <li>n. 试验；检验</li>
                         <li>vt. 试验；测试</li>
                         <li>vi. 试验；测试</li>
                         <li>n. (Test)人名；(英)特斯特</li>
                        </ul>
                 *
                 * The pattern contains two groups. The first,
                 * '(<li>(?<content>.+)</li>[\r\n\s]*)', is unnamed; in the sample
                 * above it has four captures, the first being
                 * '<li>n. 试验；检验</li>' (plus trailing whitespace), and so on.
                 *
                 * The second is the named group 'content'; it also has four
                 * captures in this sample, the first being 'n. 试验；检验', and so on.
                 *
                 * The line breaks inside the literal are ignored because the base
                 * class matches with RegexOptions.IgnorePatternWhitespace.
                 */

                return @"<div\sclass=""trans-container"">[\r\n\s]*
<ul>[\r\n\s]*
(<li>(?<content>.+)</li>[\r\n\s]*)*
</ul>";
            }
        }

        /// <summary>
        /// Joins every capture of the <c>content</c> group into one string, each
        /// capture followed by a newline, and stores it under the key
        /// <c>"content"</c>. The value is an empty string when the group did not
        /// match.
        /// </summary>
        /// <param name="match">Match of <see cref="Pattern"/> against the page.</param>
        /// <returns>A dictionary with the single key <c>"content"</c>.</returns>
        protected override Dictionary<string, string> FilterMatch(Match match)
        {
            var translations = new StringBuilder();
            var group = match.Groups["content"];
            if (group.Success)
            {
                foreach (Capture capture in group.Captures)
                {
                    translations.Append(capture.Value).Append('\n');
                }
            }

            var dict = new Dictionary<string, string>();
            dict["content"] = translations.ToString();
            return dict;
        }

        /// <summary>
        /// Looks up <paramref name="word"/> on dict.youdao.com.
        /// </summary>
        /// <param name="word">
        /// Word or phrase to query. It is percent-encoded before being placed in
        /// the query string; <c>null</c> is treated as an empty query.
        /// </param>
        /// <returns>
        /// The translations found on the page, one per line, or an empty string
        /// when the page contains no translation list matching the pattern.
        /// </returns>
        public string QueryWord(string word)
        {
            // Escape the word so spaces, '&', '#', '+' and non-ASCII characters
            // cannot break or alter the query string.
            var url = "http://dict.youdao.com/search?q=" + Uri.EscapeDataString(word ?? string.Empty);
            var dict = Scrape(url);
            return dict["content"];
        }
    }
}