网页内容扫描器

程序简介

使用RestSharp即时获取目标网页的内容,使用正则表达式提取需要识别的内容并对比结果。

 

C#正则表达式的“贪婪”模式

正则表达式引擎默认是贪婪的,只要模式允许,它将匹配尽可能多的字符。

如何匹配满足条件的最短字符 ?

通过在“重复描述字符”(如*,如+)后面添加“?”,可以将匹配模式改成非贪婪。

 

代码

主窗体代码

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Wsion.ClassLib;
using RestSharp;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;

namespace WebpageScanner
{
    /// <summary>
    /// Main window of the web page scanner. While a scan is running, a worker
    /// thread repeatedly downloads the configured URL, extracts the first capture
    /// group of the configured regular expression and compares it with the
    /// expected text. On a mismatch it launches the configured alert program and
    /// stops scanning.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Pause between two consecutive polls, in milliseconds.</summary>
        private const int PollIntervalMs = 2000;

        /// <summary>Worker thread of the most recently started scan, if any.</summary>
        private Thread thread;

        /// <summary>
        /// True while the worker should keep polling. Written by the UI thread
        /// (start/stop/close) and by the worker (mismatch/failure), hence volatile.
        /// </summary>
        private volatile bool running;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Asks the worker to stop once the window has been closed.
        /// </summary>
        protected override void OnFormClosed(FormClosedEventArgs e)
        {
            running = false;
            base.OnFormClosed(e);
        }

        /// <summary>
        /// Start button: begins scanning on a background thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Flip the state before the thread starts so the worker cannot
            // observe "not running" on its first loop check and exit at once.
            button1.Enabled = false;
            running = true;

            thread = new Thread(new ThreadStart(process));
            // A background thread does not keep the process alive after the
            // window has been closed.
            thread.IsBackground = true;
            thread.Start();
        }

        /// <summary>
        /// Clear button: empties the log box.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            textBoxLog.Text = string.Empty;
        }

        /// <summary>
        /// Stop button: asks the worker to finish and re-enables the start button.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            running = false;
            button1.Enabled = true;
        }

        /// <summary>
        /// Worker loop: polls until stopped, until the extracted text differs
        /// from the expected text, or until an error occurs.
        /// </summary>
        private void process()
        {
            try
            {
                while (running)
                {
                    if (ScanOnce())
                    {
                        Alert();
                        running = false;
                        SetEnabled(button1, true);
                        break;
                    }

                    Thread.Sleep(PollIntervalMs);
                }
            }
            catch (Exception ex)
            {
                // Last-resort handler: an exception escaping a worker thread
                // would terminate the whole process (bad URL, bad pattern,
                // missing alert program, ...). Report it and stop scanning.
                ReportFailure(ex);
            }
        }

        /// <summary>
        /// Downloads the page once, logs the extracted text and compares it with
        /// the expected text.
        /// </summary>
        /// <returns>
        /// True when the extracted text differs from the expected text; false when
        /// it matches or when the request produced no body.
        /// </returns>
        private bool ScanOnce()
        {
            RestClient client = new RestClient();
            client.BaseUrl = new Uri(GetText(textBoxUrl));
            // Proxy configuration, if required:
            //client.Proxy = new WebProxy("proxy.net", 8080);
            //client.Proxy.Credentials = System.Net.CredentialCache.DefaultCredentials;

            RestRequest request = new RestRequest();
            request.Method = Method.GET;
            IRestResponse responsebody = client.Execute(request);

            if (responsebody.RawBytes == null)
            {
                // No body to decode (e.g. the request failed); log it and let
                // the caller retry after the usual pause.
                AppendLog("Request failed: " + responsebody.ErrorMessage);
                return false;
            }

            string resBody = EncodingHelper.Instance.BytesToUnicode(responsebody.RawBytes);

            // Flags must be combined with '|'; '&' of two distinct flags is None.
            Regex regex = new Regex(GetText(textBoxRegex), RegexOptions.Multiline | RegexOptions.IgnoreCase);
            string result = regex.Match(resBody).Groups[1].Value
                .Replace("\n", " ")
                .Replace("\r", " ")
                .Trim();

            AppendLog(result);

            return result != GetText(textBoxTarget);
        }

        /// <summary>
        /// Signals that the scanned content changed by launching the program or
        /// document whose path is entered in the path box.
        /// </summary>
        private void Alert()
        {
            System.Diagnostics.Process.Start(GetText(textBoxPath));
        }

        /// <summary>
        /// Appends a time-stamped entry to the log box.
        /// </summary>
        private void AppendLog(string message)
        {
            string entry = string.Format("[{0}]{1}{2}{1}{1}",
                DateTime.Now.ToLongTimeString(), Environment.NewLine, message);
            SetText(textBoxLog, GetText(textBoxLog) + entry);
        }

        /// <summary>
        /// Stops scanning after an error and, if the window still exists, logs the
        /// error message and re-enables the start button.
        /// </summary>
        private void ReportFailure(Exception ex)
        {
            running = false;

            if (IsDisposed || !IsHandleCreated)
            {
                // The window is gone; there is nothing left to report to.
                return;
            }

            try
            {
                AppendLog(ex.Message);
                SetEnabled(button1, true);
            }
            catch (InvalidOperationException)
            {
                // The window was closed while reporting (ObjectDisposedException
                // derives from InvalidOperationException); nothing more to do.
            }
        }

        #region SetVal Template
        /*
         * Read and change control properties from another thread.
         */

        delegate void SetValueHandler<T>(Control control, T val);

        /// <summary>
        /// Reads <c>control.Text</c>, marshalling to the control's thread when needed.
        /// </summary>
        private string GetText(Control control)
        {
            if (control.InvokeRequired)
            {
                Func<Control, string> reader = new Func<Control, string>(GetText);
                return (string)control.Invoke(reader, new object[] { control });
            }

            return control.Text;
        }

        /// <summary>
        /// Sets <c>control.Text</c>, marshalling to the control's thread when needed.
        /// </summary>
        private void SetText(Control control, string val)
        {
            if (control.InvokeRequired)
            {
                SetValueHandler<string> handler = new SetValueHandler<string>(SetText);
                control.Invoke(handler, new object[] { control, val });
            }
            else
            {
                control.Text = val;
            }
        }

        /// <summary>
        /// Sets <c>control.Enabled</c>, marshalling to the control's thread when needed.
        /// </summary>
        private void SetEnabled(Control control, bool val)
        {
            if (control.InvokeRequired)
            {
                SetValueHandler<bool> handler = new SetValueHandler<bool>(SetEnabled);
                control.Invoke(handler, new object[] { control, val });
            }
            else
            {
                control.Enabled = val;
            }
        }

        #endregion
    }
}

 

 

以下为编码转换器部分

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Wsion.ClassLib
{
    /// <summary>
    /// Text encoding conversion helpers, exposed through one shared instance.
    /// </summary>
    public class EncodingHelper
    {
        /// <summary>The shared helper instance.</summary>
        public static readonly EncodingHelper Instance = new EncodingHelper();

        /// <summary>
        /// Encodes <paramref name="fromString"/> with <paramref name="fromEncoding"/>,
        /// transcodes the resulting bytes to <paramref name="toEncoding"/> and decodes
        /// them again with <paramref name="toEncoding"/>.
        /// </summary>
        /// <param name="fromString">The text to convert.</param>
        /// <param name="fromEncoding">Encoding used to turn the text into bytes.</param>
        /// <param name="toEncoding">Encoding the bytes are converted to and decoded with.</param>
        /// <returns>The decoded text.</returns>
        public string EncodingConvert(string fromString, Encoding fromEncoding, Encoding toEncoding)
        {
            byte[] sourceBytes = fromEncoding.GetBytes(fromString);
            return toEncoding.GetString(Encoding.Convert(fromEncoding, toEncoding, sourceBytes));
        }

        /// <summary>
        /// Runs <see cref="EncodingConvert"/> from "gb2312" to UTF-8.
        /// </summary>
        public string GB2312ToUtf8(string gb2312String)
        {
            return EncodingConvert(gb2312String, Encoding.GetEncoding("gb2312"), Encoding.UTF8);
        }

        /// <summary>
        /// Runs <see cref="EncodingConvert"/> from UTF-8 to "gb2312".
        /// </summary>
        public string Utf8ToGB2312(string utf8String)
        {
            return EncodingConvert(utf8String, Encoding.UTF8, Encoding.GetEncoding("gb2312"));
        }

        /// <summary>
        /// Runs <see cref="EncodingConvert"/> from "gbk" to UTF-16 (<see cref="Encoding.Unicode"/>).
        /// </summary>
        public string GbkToUnicode(string gbkString)
        {
            return EncodingConvert(gbkString, Encoding.GetEncoding("gbk"), Encoding.Unicode);
        }

        /// <summary>
        /// Decodes <paramref name="bytes"/> into a string. Despite the general
        /// name, the bytes are always interpreted as "gbk".
        /// </summary>
        public string BytesToUnicode(byte[] bytes)
        {
            return Encoding.GetEncoding("gbk").GetString(bytes);
        }
    }
}

 

目前多线程Thread.Abort()时程序偶尔会崩溃,希望得到指正。

 

源码下载地址:

http://files.cnblogs.com/files/wsion/WebpageScanner.7z

 

原创博文,转载请注明出处

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。