网络爬虫获取网站图片

浏览数：67 / 时间：2015年06月09日
  转自：http://blog.csdn.net/huangxy10/article/details/8120106

  备注：把项目属性中的字符集改成“使用多字节字符集”。  

  1 // 网络爬虫.cpp : 定义控制台应用程序的入口点。
  2 //
  3 
  4 #include "stdafx.h"
  5 /*
  6 
  7 int _tmain(int argc, _TCHAR* argv[])
  8 {
  9     return 0;
 10 }
 11 
 12 */
 13 
 14 //#include <Windows.h>  
 15 #include <string>  
 16 #include <iostream>  
 17 #include <fstream>  
 18 #include <vector>  
 19 #include "winsock2.h"  
 20 #include <time.h>  
 21 #include <queue>  
 22 #include <hash_set>  
 23   
 24 #pragma comment(lib, "ws2_32.lib")   
 25 using namespace std;  
 26   
 27 #define DEFAULT_PAGE_BUF_SIZE 1048576  // Initial receive-buffer size in bytes (1 MiB); GetHttpResponse doubles it when nearly full.
 28   
 29 queue<string> hrefUrl;  // Links waiting to be crawled: HTMLParse pushes them, main consumes them front-first.
 30 hash_set<string> visitedUrl;  // Every link that has already been queued, so each is queued only once.
 31 hash_set<string> visitedImg;  // Every image URL already handed to a download list.
 32 int depth=0;  // Not referenced anywhere in the visible code.
 33 int g_ImgCnt=1;  // Running number printed by DownLoadImg for each saved image; starts at 1.
 34   
 // Splits an HTTP URL into its host name and resource path.
 //
 // An optional leading "http://" is skipped. Everything up to the first '/'
 // becomes the host; the rest, starting with that '/', becomes the resource.
 // A trailing "#fragment" is dropped because fragments are never part of the
 // request sent to a server.
 //
 // Parameters:
 //   url      - URL to split; URLs longer than 2000 characters are rejected.
 //   host     - receives the host part on success, untouched on failure.
 //   resource - receives the path and query on success, untouched on failure.
 // Returns true on success; false when the URL is too long, has no '/' after
 // the host, or has an empty host.
 bool ParseURL( const std::string & url, std::string & host, std::string & resource ){
     const std::string::size_type kMaxUrlLength = 2000;
     const std::string scheme = "http://";

     if( url.size() > kMaxUrlLength ){
         return false;
     }

     // Only a scheme at the very start counts: "http://" inside a query string
     // must not be mistaken for the beginning of the host.
     const std::string::size_type hostBegin =
         ( url.compare( 0, scheme.size(), scheme ) == 0 ) ? scheme.size() : 0;

     const std::string::size_type slash = url.find( '/', hostBegin );
     if( slash == std::string::npos || slash == hostBegin ){
         return false;   // no path separator, or nothing in front of it
     }

     // Build both parts before assigning so the call stays correct even if an
     // output argument aliases `url`. substr() clamps the length when there is
     // no '#', so the whole remainder is taken in that case.
     const std::string parsedHost = url.substr( hostBegin, slash - hostBegin );
     const std::string parsedResource = url.substr( slash, url.find( '#', slash ) - slash );

     host = parsedHost;
     resource = parsedResource;
     return true;
 }
 53   
 54 //使用Get请求，得到响应  
 55 bool GetHttpResponse( const string & url, char * &response, int &bytesRead ){  
 56     string host, resource;  
 57     if(!ParseURL( url, host, resource )){  
 58         cout << "Can not parse the url"<<endl;  
 59         return false;  
 60     }  
 61       
 62     //建立socket  
 63     struct hostent * hp= gethostbyname( host.c_str() );  
 64     if( hp==NULL ){  
 65         cout<< "Can not find host address"<<endl;  
 66         return false;  
 67     }  
 68   
 69     SOCKET sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP);  
 70     if( sock == -1 || sock == -2 ){  
 71         cout << "Can not create sock."<<endl;  
 72         return false;  
 73     }  
 74   
 75     //建立服务器地址  
 76     SOCKADDR_IN sa;  
 77     sa.sin_family = AF_INET;  
 78     sa.sin_port = htons( 80 );  
 79     //char addr[5];  
 80     //memcpy( addr, hp->h_addr, 4 );  
 81     //sa.sin_addr.s_addr = inet_addr(hp->h_addr);  
 82     memcpy( &sa.sin_addr, hp->h_addr, 4 );  
 83   
 84     //建立连接  
 85     if( 0!= connect( sock, (SOCKADDR*)&sa, sizeof(sa) ) ){  
 86         cout << "Can not connect: "<< url <<endl;  
 87         closesocket(sock);  
 88         return false;  
 89     };  
 90   
 91     //准备发送数据  
 92     string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";  
 93   
 94     //发送数据  
 95     if( SOCKET_ERROR ==send( sock, request.c_str(), request.size(), 0 ) ){  
 96         cout << "send error" <<endl;  
 97         closesocket( sock );  
 98         return false;  
 99     }  
100   
101     //接收数据  
102     int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;  
103     char *pageBuf = (char *)malloc(m_nContentLength);  
104     memset(pageBuf, 0, m_nContentLength);  
105   
106     bytesRead = 0;  
107     int ret = 1;  
108     cout <<"Read: ";  
109     while(ret > 0){  
110         ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);  
111           
112         if(ret > 0)  
113         {  
114             bytesRead += ret;  
115         }  
116   
117         if( m_nContentLength - bytesRead<100){  
118             cout << "\nRealloc memorry"<<endl;  
119             m_nContentLength *=2;  
120             pageBuf = (char*)realloc( pageBuf, m_nContentLength);       //重新分配内存  
121         }  
122         cout << ret <<" ";  
123     }  
124     cout <<endl;  
125   
126     pageBuf[bytesRead] = ‘\0‘;  
127     response = pageBuf;  
128     closesocket( sock );  
129     return true;  
130     //cout<< response <<endl;  
131 }  
132   
133 //提取所有的URL以及图片URL  
134 void HTMLParse ( string & htmlResponse, vector<string> & imgurls, const string & host ){  
135     //找所有连接，加入queue中  
136     const char *p= htmlResponse.c_str();  
137     char *tag="href=\"";  
138     const char *pos = strstr( p, tag );  
139     ofstream ofile("url.txt", ios::app);  
140     while( pos ){  
141         pos +=strlen(tag);  
142         const char * nextQ = strstr( pos, "\"" );  
143         if( nextQ ){  
144             char * url = new char[ nextQ-pos+1 ];  
145             //char url[100]; //固定大小的会发生缓冲区溢出的危险  
146             sscanf( pos, "%[^\"]", url);  
147             string surl = url;  // 转换成string类型，可以自动释放内存  
148             if( visitedUrl.find( surl ) == visitedUrl.end() ){  
149                 visitedUrl.insert( surl );  
150                 ofile << surl<<endl;  
151                 hrefUrl.push( surl );  
152             }  
153             pos = strstr(pos, tag );  
154             delete [] url;  // 释放掉申请的内存  
155         }  
156     }  
157     ofile << endl << endl;  
158     ofile.close();  
159   
160     tag ="<img ";  
161     const char* att1= "src=\"";  
162     const char* att2="lazy-src=\"";  
163     const char *pos0 = strstr( p, tag );  
164     while( pos0 ){  
165         pos0 += strlen( tag );  
166         const char* pos2 = strstr( pos0, att2 );  
167         if( !pos2 || pos2 > strstr( pos0, ">") ) {  
168             pos = strstr( pos0, att1);  
169             if(!pos) {  
170                 pos0 = strstr(att1, tag );  
171             continue;  
172             } else {  
173                 pos = pos + strlen(att1);  
174             }  
175         }  
176         else {  
177             pos = pos2 + strlen(att2);  
178         }  
179   
180         const char * nextQ = strstr( pos, "\"");  
181         if( nextQ ){  
182             char * url = new char[nextQ-pos+1];  
183             sscanf( pos, "%[^\"]", url);  
184             cout << url<<endl;  
185             string imgUrl = url;  
186             if( visitedImg.find( imgUrl ) == visitedImg.end() ){  
187                 visitedImg.insert( imgUrl );  
188                 imgurls.push_back( imgUrl );  
189             }  
190             pos0 = strstr(pos0, tag );  
191             delete [] url;  
192         }  
193     }  
194     cout << "end of Parse this html"<<endl;  
195 }  
196   
// Turns a URL into a name that can be used as a Windows file name.
//
// The characters \ / : * ? " < > | (not allowed in Windows file names) are
// removed, every other character is kept in order, and ".txt" is appended.
//
// Parameters:
//   url - text to sanitize; may be empty.
// Returns the sanitized name, which always ends in ".txt".
std::string ToFileName( const std::string &url ){
    const std::string forbidden = "\\/:*?\"<>|";

    std::string fileName;
    fileName.reserve( url.size() + 4 );   // room for the kept characters plus ".txt"
    for( const char ch : url ){
        if( forbidden.find( ch ) == std::string::npos ){
            fileName += ch;
        }
    }
    fileName += ".txt";
    return fileName;
}
209   
210 //下载图片到img文件夹  
211 void DownLoadImg( vector<string> & imgurls, const string &url ){  
212   
213     //生成保存该url下图片的文件夹  
214     string foldname = ToFileName( url );  
215     foldname = "./img/"+foldname;  
216     if(!CreateDirectory( (LPCSTR)foldname.c_str(),NULL ))  
217         cout << "Can not create directory:"<< foldname<<endl;  
218     char *image;  
219     int byteRead;  
220     for( int i=0; i<imgurls.size(); i++){  
221         //判断是否为图片，bmp，jgp，jpeg，gif   
222         string str = imgurls[i];  
223         int pos = str.find_last_of(".");  
224         if( pos == string::npos )  
225             continue;  
226         else{  
227             string ext = str.substr( pos+1, str.size()-pos-1 );  
228             if( ext!="bmp"&& ext!="jpg" && ext!="jpeg"&& ext!="gif"&&ext!="png")  
229                 continue;  
230         }  
231         //下载其中的内容  
232         if( GetHttpResponse(imgurls[i], image, byteRead)){  
233             if ( strlen(image) ==0 ) {  
234                 continue;  
235             }  
236             const char *p=image;  
237             const char * pos = strstr(p,"\r\n\r\n")+strlen("\r\n\r\n");  
238             int index = imgurls[i].find_last_of("/");  
239             if( index!=string::npos ){  
240                 string imgname = imgurls[i].substr( index , imgurls[i].size() );  
241                 ofstream ofile( foldname+imgname, ios::binary );  
242                 if( !ofile.is_open() )  
243                     continue;  
244                 cout <<g_ImgCnt++<< foldname+imgname<<endl;  
245                 ofile.write( pos, byteRead- (pos-p) );  
246                 ofile.close();  
247             }  
248             free(image);  
249         }  
250     }  
251 }  
252   
253   
254   
255 //广度遍历  
256 void BFS( const string & url ){  
257     char * response;  
258     int bytes;  
259     // 获取网页的相应，放入response中。  
260     if( !GetHttpResponse( url, response, bytes ) ){  
261         cout << "The url is wrong! ignore." << endl;  
262         return;  
263     }  
264     string httpResponse=response;  
265     free( response );  
266     string filename = ToFileName( url );  
267     ofstream ofile( "./html/"+filename );  
268     if( ofile.is_open() ){  
269         // 保存该网页的文本内容  
270         ofile << httpResponse << endl;  
271         ofile.close();  
272     }  
273     vector<string> imgurls;  
274     //解析该网页的所有图片链接，放入imgurls里面  
275     HTMLParse( httpResponse,  imgurls, url );  
276       
277     //下载所有的图片资源  
278     DownLoadImg( imgurls, url );  
279 }  
280   
281 void main()  
282 {  
283     //初始化socket，用于tcp网络连接  
284     WSADATA wsaData;  
285     if( WSAStartup(MAKEWORD(2,2), &wsaData) != 0 ){  
286         return;  
287     }  
288   
289     // 创建文件夹，保存图片和网页文本文件  
290     CreateDirectory((LPCSTR) "./img",0);  
291     CreateDirectory((LPCSTR)"./html",0);  
292     //string urlStart = "http://hao.360.cn/meinvdaohang.html";  
293   
294     // 遍历的起始地址  
295     // string urlStart = "http://www.wmpic.me/tupian";  
296     string urlStart = "http://item.taobao.com/item.htm?spm=a230r.1.14.19.sBBNbz&id=36366887850&ns=1#detail";  
297       
298     // 使用广度遍历  
299     // 提取网页中的超链接放入hrefUrl中，提取图片链接，下载图片。  
300     BFS( urlStart );  
301   
302     // 访问过的网址保存起来  
303     visitedUrl.insert( urlStart );  
304   
305     while( hrefUrl.size()!=0 ){  
306         string url = hrefUrl.front();  // 从队列的最开始取出一个网址  
307         cout << url << endl;  
308         BFS( url );                   // 遍历提取出来的那个网页，找它里面的超链接网页放入hrefUrl，下载它里面的文本，图片  
309         hrefUrl.pop();                 // 遍历完之后，删除这个网址  
310     }  
311     WSACleanup();  
312     return;  
313 }