网页爬虫WebCrawler(1)-Http网页内容抓取
在Windows下用C++通过HTTP协议实现网页内容的抓取:
首先介绍两个重要的包(它们一般是Linux下的开源库,在Windows下则通过动态链接库dll来调用):curl包和pthreads_dll。其中curl包可以理解为命令行浏览器,通过调用其提供的curl_easy_setopt等函数即可获取指定网页的内容(要正确地编译并导入curl链接库,还需要另外一个包C-ares)。pthreads是多线程控制包,其中包含互斥量的加锁和解锁、线程的创建与分配等函数。
下载地址:点击打开链接。要正确地导入外部动态链接库,需要以下步骤:1. 项目->属性->配置属性->C/C++->常规->附加包含目录(添加include的路径);2. 项目->属性->配置属性->链接器->常规->附加库目录(添加lib所在的路径);3. 在链接器->输入->附加依赖项中添加libcurld.lib;pthreadVC2.lib;ws2_32.lib;winmm.lib;wldap32.lib;areslib.lib;4. 在C/C++->预处理器->预处理器定义中添加_CONSOLE;BUILDING_LIBCURL;HTTP_ONLY。
具体实现过程介绍:
1:自定义HashTable结构,用于记录已获取的string字符串(实际保存的是字符串的哈希值)。以HashTable类的形式实现,包含set类型的哈希值集合、ForceAdd和Find函数,以及几种常见的string哈希函数
Code:
///HashTable.h
#ifndef HashTable_H
#define HashTable_H

#include <set>
#include <string>
#include <vector>

/**
 * Remembers strings by the 32-bit hash of their contents.
 *
 * Only hash values are stored, never the strings themselves, so two
 * different strings that happen to share a hash are indistinguishable to
 * ForceAdd() and Find().
 */
class HashTable
{
public:
    /// Value Find() yields when the string's hash has not been recorded:
    /// -1 converted to unsigned int.
    /// Use it in comparisons only; no out-of-class definition is provided,
    /// so do not bind it to a reference or take its address.
    static const unsigned int kNotFound = static_cast<unsigned int>(-1);

    HashTable(void);
    ~HashTable(void);

    /// Records the hash of @p str and returns that hash.
    unsigned int ForceAdd(const std::string& str);

    /// Returns the hash of @p str if it has been recorded, otherwise
    /// (unsigned int)-1, i.e. kNotFound.
    unsigned int Find(const std::string& str);

    /* Common string hash functions.
     * They are pure functions of their argument and read no member state,
     * so they are static: callable as HashTable::ELFHash(s) as well as
     * through an instance. */
    static unsigned int RSHash  (const std::string& str);
    static unsigned int JSHash  (const std::string& str);
    static unsigned int PJWHash (const std::string& str);
    static unsigned int ELFHash (const std::string& str);
    static unsigned int BKDRHash(const std::string& str);
    static unsigned int SDBMHash(const std::string& str);
    static unsigned int DJBHash (const std::string& str);
    static unsigned int DEKHash (const std::string& str);
    static unsigned int BPHash  (const std::string& str);
    static unsigned int FNVHash (const std::string& str);
    static unsigned int APHash  (const std::string& str);

private:
    /// Hash values recorded so far.
    std::set<unsigned int> HashFunctionResultSet;

    /// Not consulted by ForceAdd() or Find() when deciding membership.
    std::vector<unsigned int> hhh;
};

#endif
/////HashTable.cpp #include "HashTable.h" HashTable::HashTable(void) { } HashTable::~HashTable(void) { } unsigned int HashTable::ForceAdd(const std::string& str) { unsigned int i=ELFHash(str); HashFunctionResultSet.insert(i); return i; } unsigned int HashTable::Find(const std::string& str) { int ff=hhh.size(); const unsigned int i=ELFHash(str); std::set<unsigned int>::const_iterator it; if(HashFunctionResultSet.size()>0) { it=HashFunctionResultSet.find(i); if(it==HashFunctionResultSet.end()) return -1; } else { return -1; } return i; } /*几种常见的字符串hash方式实现函数*/ unsigned int HashTable::APHash(const std::string& str) { unsigned int hash=0xAAAAAAAA; for(std::size_t i=0;i<str.length();i++) { hash^=((i & 1) == 0) ? ( (hash << 7) ^ str[i] * (hash >> 3)) : (~((hash << 11) + str[i] ^ (hash >> 5))); } return hash; } unsigned int HashTable::BKDRHash(const std::string& str) { unsigned int seed=131; //31 131 1313 13131 131313 etc unsigned int hash=0; for(std::size_t i=0;i<str.length();i++) { hash=(hash*seed)+str[i]; } return hash; } unsigned int HashTable::BPHash(const std::string& str) { unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = hash << 7 ^ str[i]; } return hash; } unsigned int HashTable::DEKHash(const std::string& str) { unsigned int hash = static_cast<unsigned int>(str.length()); for(std::size_t i = 0; i < str.length(); i++) { hash = ((hash << 5) ^ (hash >> 27)) ^ str[i]; } return hash; } unsigned int HashTable::DJBHash(const std::string& str) { unsigned int hash = 5381; for(std::size_t i = 0; i < str.length(); i++) { hash = ((hash << 5) + hash) + str[i]; } return hash; } unsigned int HashTable::ELFHash(const std::string& str) { unsigned int hash=0; unsigned int x=0; for(std::size_t i = 0; i < str.length(); i++) { hash=(hash<<4)+str[i]; if((x = hash & 0xF0000000L) != 0) hash^=(x>>24); hash&=~x; } return hash; } unsigned int HashTable::FNVHash(const std::string& str) { const unsigned int fnv_prime = 0x811C9DC5; unsigned int hash = 0; 
for(std::size_t i = 0; i < str.length(); i++) { hash *= fnv_prime; hash ^= str[i]; } return hash; } unsigned int HashTable::JSHash(const std::string& str) { unsigned int hash = 1315423911; for(std::size_t i = 0; i < str.length(); i++) { hash ^= ((hash << 5) + str[i] + (hash >> 2)); } return hash; } unsigned int HashTable::PJWHash(const std::string& str) { unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); unsigned int hash = 0; unsigned int test = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = (hash << OneEighth) + str[i]; if((test = hash & HighBits) != 0) hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); } return hash; } unsigned int HashTable::RSHash(const std::string& str) { unsigned int b = 378551; unsigned int a = 63689; unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = hash * a + str[i]; a = a * b; } return hash; } unsigned int HashTable::SDBMHash(const std::string& str) { unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = str[i] + (hash << 6) + (hash << 16) - hash; } return hash; }
2:实现线程间的互斥处理函数(另外提供进行当前操作的线程ID,以便实现加锁机制)。以SingleTone类实现。该类只能由静态函数Instance建立唯一的一个类对象,并以互斥的方式实现对HashTable的基本操作,其中变量的加锁和解锁由mutex类来实现,具体参见代码:
////mutex.h
#ifndef mutex_H
#define mutex_H
#pragma once

#include "pthread.h"

/**
 * Scoped lock for a pthread mutex.
 *
 * The constructor locks the given mutex and the destructor unlocks it, so
 * the mutex is held for exactly as long as the guard object is alive.
 * The guard only refers to the mutex; it neither initialises nor destroys
 * it, and the mutex must outlive the guard.
 */
class mutex
{
public:
    /// Locks @p m. `explicit` so that a pthread_mutex_t is never silently
    /// converted into a guard (and locked) by accident.
    explicit mutex(pthread_mutex_t& m) : m_mutex(m)
    {
        pthread_mutex_lock(&m_mutex);
    }

    /// Unlocks the mutex that was locked in the constructor.
    ~mutex(void)
    {
        pthread_mutex_unlock(&m_mutex);
    }

private:
    // Copying is forbidden: a copy would unlock the same mutex a second
    // time when it is destroyed. Declared private and left undefined.
    mutex(const mutex&);
    mutex& operator=(const mutex&);

    pthread_mutex_t& m_mutex;
};

#endif
////SingleTone.h #ifndef SingleTone_H #define SingleTone_H #include <string> #include <list> #include <map> #include "Constants.h" #include "HashTable.h" #include "pthread.h" #include "curl/curl.h" class SingleTone{ public: static SingleTone* Instance(); void push_back(std::string s); void pop_back(); int size(); std::list<std::string>::reference back(); std::list<std::string>::iterator begin(); std::list<std::string>::iterator end(); void push_front(std::string s); bool empty(); unsigned int Get_m_UniqueMap_ForceAdd(const std::string& key,const std::string& url); unsigned int Get_m_UniqueMap_Find(const std::string& key,const std::string& url); HashTable Get_m_UniqueMap(const std::string& key); void Set_m_UniqueMap(const std::string& key,HashTable& hash); CURL* GetpCurl(); protected: SingleTone(); ~SingleTone(); pthread_mutex_t m_singleton_mutex; private: static SingleTone* m_pSingleTone; std::list<std::string> m_LinkStack; std::map<std::string,HashTable> m_UniqueMap; CURL *m_pcurl; }; #endif
#include "SingleTone.h"
#include "mutex.h"

SingleTone* SingleTone::m_pSingleTone = NULL;

namespace
{
    // Serialises the lazy construction in Instance(). It cannot be a member
    // of SingleTone because it is needed before any SingleTone exists.
    pthread_mutex_t g_instance_mutex = PTHREAD_MUTEX_INITIALIZER;
}

/** Initialises the instance mutex and creates the libcurl easy handle. */
SingleTone::SingleTone()
{
    pthread_mutex_init(&m_singleton_mutex, NULL);
    m_pcurl = curl_easy_init();
}

/**
 * Destroys the instance mutex.
 *
 * The CURL handle is deliberately not cleaned up here: Http::DeInitCurl()
 * calls curl_easy_cleanup() on the same handle, and cleaning it up twice
 * would be an error.
 */
SingleTone::~SingleTone()
{
    pthread_mutex_destroy(&m_singleton_mutex);
}

/**
 * Returns the single shared instance, creating it on the first call.
 *
 * Creation is done under a lock so that two threads calling Instance() at
 * the same time cannot both see NULL and construct two objects.
 */
SingleTone* SingleTone::Instance()
{
    mutex guard(g_instance_mutex);
    if (m_pSingleTone == NULL)
    {
        m_pSingleTone = new SingleTone();
    }
    return m_pSingleTone;
}

/** Appends @p s to the end of the link list. */
void SingleTone::push_back(std::string s)
{
    mutex guard(m_singleton_mutex);
    m_LinkStack.push_back(s);
}

/** Removes the last link; does nothing when the list is already empty. */
void SingleTone::pop_back()
{
    mutex guard(m_singleton_mutex);
    // std::list::pop_back() on an empty list is undefined behaviour.
    if (!m_LinkStack.empty())
    {
        m_LinkStack.pop_back();
    }
}

/** Number of links currently stored. */
int SingleTone::size()
{
    mutex guard(m_singleton_mutex);
    return static_cast<int>(m_LinkStack.size());
}

/**
 * Iterator to the first link.
 *
 * The lock is held only while the iterator is obtained; the caller must
 * make sure no other thread modifies the list while iterating.
 */
std::list<std::string>::iterator SingleTone::begin()
{
    mutex guard(m_singleton_mutex);
    return m_LinkStack.begin();
}

/**
 * Reference to the last link. The list must not be empty.
 *
 * The lock is released when this function returns, so the reference is
 * not protected afterwards; copy the value before another thread can pop it.
 */
std::list<std::string>::reference SingleTone::back()
{
    mutex guard(m_singleton_mutex);
    return m_LinkStack.back();
}

/** Past-the-end iterator of the link list; same caveat as begin(). */
std::list<std::string>::iterator SingleTone::end()
{
    mutex guard(m_singleton_mutex);
    return m_LinkStack.end();
}

/** Inserts @p s at the front of the link list. */
void SingleTone::push_front(std::string s)
{
    mutex guard(m_singleton_mutex);
    m_LinkStack.push_front(s);
}

/** True when no links are stored. */
bool SingleTone::empty()
{
    mutex guard(m_singleton_mutex);
    return m_LinkStack.empty();
}

/**
 * Records @p url in the table stored under @p key, creating the table if
 * it does not exist yet.
 *
 * @return the value returned by HashTable::ForceAdd().
 */
unsigned int SingleTone::Get_m_UniqueMap_ForceAdd(const std::string& key, const std::string& url)
{
    mutex guard(m_singleton_mutex);
    return m_UniqueMap[key].ForceAdd(url);
}

/**
 * Looks @p url up in the table stored under @p key.
 *
 * As before, a missing key gets an empty table inserted by operator[].
 * That insertion modifies the map, so the lookup is done under the lock,
 * and the table is used by reference instead of being copied first.
 *
 * @return the value returned by HashTable::Find().
 */
unsigned int SingleTone::Get_m_UniqueMap_Find(const std::string& key, const std::string& url)
{
    mutex guard(m_singleton_mutex);
    HashTable& table = m_UniqueMap[key];
    return table.Find(url);
}

/** Returns a copy of the table stored under @p key (created empty if absent). */
HashTable SingleTone::Get_m_UniqueMap(const std::string& key)
{
    mutex guard(m_singleton_mutex);
    return m_UniqueMap[key];
}

/** Stores a copy of @p hash under @p key, replacing any previous table. */
void SingleTone::Set_m_UniqueMap(const std::string& key, HashTable& hash)
{
    mutex guard(m_singleton_mutex);
    m_UniqueMap[key] = hash;
}

/** The libcurl easy handle created in the constructor (may be NULL if curl_easy_init() failed). */
CURL* SingleTone::GetpCurl()
{
    return m_pcurl;
}
3:通过HTTP获取网页内容:功能包括抓取初始网页的内容以及设置URL等函数。这个过程要求互斥执行,所以引入了SingleTone类。
Code:
/////Http.h
#ifndef Http_H
#define Http_H

#include "curl/curl.h"
#include "pthread.h"
#include <string>

// NOTE(review): a using-directive in a header leaks into every file that
// includes it. It is kept because other files may rely on it; new code
// should spell out std::.
using namespace std;

/**
 * Fetches the body of a web page over HTTP with libcurl.
 *
 * Every Http object uses the one easy handle owned by SingleTone, so
 * transfers are serialised: only one InitCurl(url, buffer) call runs at a
 * time across all Http objects.
 */
class Http
{
public:
    /// Picks up the shared easy handle from SingleTone::Instance().
    Http(void);
    ~Http(void);

    /// Does nothing and always returns false.
    bool InitCurl(void);

    /// Downloads @p url and, on success, stores the response body in
    /// @p szbuffer. Returns false when no handle is available or the
    /// transfer fails; @p szbuffer is left untouched in that case.
    bool InitCurl(const std::string& url, std::string& szbuffer);

    /// Cleans up the easy handle and libcurl's global state.
    /// The handle is the one shared through SingleTone, so no Http object
    /// may start a transfer after this has been called.
    bool DeInitCurl();

    /// Stores @p url as the current URL.
    void setUrl(const std::string& url);
    /// Returns the current URL (despite its name, this overload is a getter).
    string setUrl();
    /// Returns a copy of the internal receive buffer.
    const string getBuffer();

private:
    /// libcurl write callback; @p f is the Http object registered with
    /// CURLOPT_WRITEDATA. Returns the number of bytes consumed.
    static size_t writer(char* buffer, size_t size, size_t nmemb, void* f);

    /// Appends size * nmemb bytes from @p buffer to the receive buffer and
    /// returns how many bytes were appended (0 when @p buffer is NULL).
    size_t setBuffer(char* buffer, size_t size, size_t nmemb);

    CURL* m_pcurl;                        ///< shared handle, NULL after DeInitCurl()
    char m_errorBuffer[CURL_ERROR_SIZE];  ///< receives libcurl's error text
    string m_szbuffer;                    ///< body received so far
    string m_szUrl;                       ///< URL of the current transfer
};

#endif

// Http.cpp
#include "Http.h"
#include "SingleTone.h"
#include "mutex.h"

namespace
{
    // Guards the easy handle shared by all Http objects: libcurl does not
    // allow one easy handle to be used by several threads at the same time.
    pthread_mutex_t g_curl_mutex = PTHREAD_MUTEX_INITIALIZER;
}

Http::Http(void)
{
    m_pcurl = SingleTone::Instance()->GetpCurl();
    m_errorBuffer[0] = '\0';
}

Http::~Http(void)
{
}

bool Http::InitCurl(void)
{
    return false;
}

size_t Http::setBuffer(char* buffer, size_t size, size_t nmemb)
{
    if (buffer == NULL)
    {
        return 0;
    }
    const size_t byteCount = size * nmemb;
    m_szbuffer.append(buffer, byteCount);
    return byteCount;
}

size_t Http::writer(char* buffer, size_t size, size_t nmemb, void* f)
{
    // libcurl aborts the transfer unless the callback returns exactly the
    // number of bytes it was given, so the count must be passed back.
    Http* self = static_cast<Http*>(f);
    if (self == NULL)
    {
        return 0;
    }
    return self->setBuffer(buffer, size, nmemb);
}

bool Http::InitCurl(const std::string& url, std::string& szbuffer)
{
    mutex guard(g_curl_mutex);

    if (m_pcurl == NULL)
    {
        return false;
    }

    m_szUrl = url;
    // Start from an empty buffer so that data left over from an earlier
    // failed transfer cannot end up in this result.
    m_szbuffer.clear();
    m_errorBuffer[0] = '\0';

    curl_easy_setopt(m_pcurl, CURLOPT_ERRORBUFFER, m_errorBuffer);
    curl_easy_setopt(m_pcurl, CURLOPT_URL, m_szUrl.c_str());
    // These options take a `long`; an int literal passed through varargs
    // has the wrong width where long is 64 bits.
    curl_easy_setopt(m_pcurl, CURLOPT_HEADER, 0L);
    curl_easy_setopt(m_pcurl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(m_pcurl, CURLOPT_WRITEFUNCTION, &Http::writer);
    curl_easy_setopt(m_pcurl, CURLOPT_WRITEDATA, this);

    const CURLcode result = curl_easy_perform(m_pcurl);
    if (result != CURLE_OK)
    {
        // As before, the URL and any partial body stay available through
        // setUrl() / getBuffer() after a failed transfer.
        return false;
    }

    szbuffer = m_szbuffer;
    m_szbuffer.clear();
    m_szUrl.clear();
    return true;
}

bool Http::DeInitCurl()
{
    mutex guard(g_curl_mutex);
    if (m_pcurl != NULL)
    {
        curl_easy_cleanup(m_pcurl);
        curl_global_cleanup();
        m_pcurl = NULL;
    }
    return true;
}

const std::string Http::getBuffer()
{
    return m_szbuffer;
}

std::string Http::setUrl()
{
    return m_szUrl;
}

void Http::setUrl(const std::string& url)
{
    m_szUrl = url;
}
其中m_szbuffer用于暂存抓取到的网页内容;抓取成功后,网页内容通过InitCurl函数的引用形参szbuffer返回给调用者。
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。