Multi-threaded Web Crawler in Python (Part 2)
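This post (part 2 of the series) gives the full source for a multi-threaded crawler built from three pieces: a Mycrawler driver, a crawlerThread worker that fetches one page and extracts its links, and a CrawQueue that tracks pending, visited, and failed URLs. A threading.Condition lock (t_mutex) serializes every access to the shared queue and to the cur/last/totalcount counters used to track crawl depth. The listing is Python 2.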
```python
#!/usr/bin/env python
# coding=utf-8
# Multi-threaded crawler (Python 2). t_mutex serializes all access
# to the shared queue and to the depth-tracking counters below.
import threading
import urllib
import re

# cur        - pages processed so far
# last       - total pages known when the current depth level began
# totalcount - total pages discovered so far
cur = 0
last = 0
totalcount = 0
depth = 0
t_mutex = threading.Condition()


def getLinks(url):
    """Download url and return every quoted "http://..." link, or None on failure."""
    try:
        page = urllib.urlopen(url)
        html = page.read()
        regob = re.compile(r'"(http://.+?)"', re.DOTALL)
        return regob.findall(html)
    except Exception:
        print 'Failed downloading and saving', url
        return None


class Mycrawler:
    def __init__(self, crawlername, seeds, threadnum):
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = CrawQueue()
        self.initQueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        self.logfile = open('log2.txt', 'w')

    def initQueue(self, seeds):
        # Accept either a single seed URL or a list of them.
        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last
        global totalcount
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

    def crawling(self):
        """Single-threaded crawl, kept for comparison with crawling2()."""
        global cur
        global depth
        global last
        global totalcount
        self.log(">>>Depth " + str(depth) + ":\n")
        while self.crawqueue.getQueueCount() != 0:
            url = self.crawqueue.pop()
            if url is None:
                continue
            self.log(url)
            self.crawqueue.addToVisited(url)
            links = getLinks(url)
            if links is None:
                self.crawqueue.addToFailed(url)
                continue
            beforenum = self.crawqueue.getQueueCount()
            self.crawqueue.addLinks(links)
            afternum = self.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            cur += 1
            if cur == last:
                # Every page known at the previous level has now been
                # processed, so the frontier moves one level deeper.
                depth += 1
                self.log(">>>Depth " + str(depth) + ":\n")
                last = totalcount

    def crawling2(self):
        """Multi-threaded crawl: start up to threadnum workers per round."""
        global last
        global totalcount
        global depth
        self.log(">>>Depth " + str(depth) + ":\n")
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount
        while self.crawqueue.getQueueCount() != 0:
            round_threads = []
            for i in range(self.threadnum):
                url = self.crawqueue.pop()
                if url is None:
                    break
                crawthread = crawlerThread(url, i, self)
                self.threadpools.append(crawthread)
                round_threads.append(crawthread)
                crawthread.start()
            # Join only this round's threads (30 s timeout each);
            # rejoining the whole pool every round was redundant.
            for crawthread in round_threads:
                crawthread.join(30)

    def log(self, content):
        self.logfile.write(content + "\n")


class crawlerThread(threading.Thread):
    def __init__(self, url, tid, mycrawler):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        global cur
        global last
        global totalcount
        global depth
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        # Network I/O happens outside the lock so workers can overlap.
        links = getLinks(self.url)
        if links is None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            self.mycrawler.crawqueue.addToFailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            beforenum = self.mycrawler.crawqueue.getQueueCount()
            self.mycrawler.crawqueue.addLinks(links)
            afternum = self.mycrawler.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur += 1
        if cur == last:
            depth += 1
            self.mycrawler.log(">>>Depth " + str(depth) + ":\n")
            last = totalcount
        t_mutex.release()


class CrawQueue:
    """Pending/visited/failed URL lists. push() inserts at the front and
    pop() removes from the back, so the pending queue is FIFO overall."""
    def __init__(self):
        self.queue = []
        self.visited = []
        self.failed = []

    def getQueue(self):
        return self.queue

    def getVisited(self):
        return self.visited

    def getFailed(self):
        return self.failed

    def push(self, url):
        # Skip empty URLs and anything already queued or visited.
        if url != "" and url not in self.queue and url not in self.visited:
            self.queue.insert(0, url)

    def pop(self):
        if len(self.queue) == 0:
            return None
        return self.queue.pop()

    def isEmpty(self):
        return len(self.queue) == 0

    def addToVisited(self, url):
        self.visited.append(url)

    def addToFailed(self, url):
        self.failed.append(url)

    def remove(self, url):
        self.queue.remove(url)

    def getVisitedCount(self):
        return len(self.visited)

    def getQueueCount(self):
        return len(self.queue)

    def addLinks(self, links):
        for link in links:
            self.push(link)


if __name__ == "__main__":
    seeds = "http://www.douban.com/"
    threadnum = int(raw_input("Number of threads: "))
    crawlername = "little crawler"
    mycrawler = Mycrawler(crawlername, seeds, threadnum)
    mycrawler.crawling2()
```

A few fixes relative to the original listing: the straight quotes replace the garbled curly ones; getLinks() is now a module-level function so that the single-threaded crawling() can also reach it (it previously called a self.getLinks that Mycrawler never defined); crawling() checks for a None URL before logging it (logging None would raise a TypeError); and crawling2() joins only the threads started in the current round instead of re-joining the ever-growing pool.
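The listing above is Python 2 (urllib.urlopen, raw_input, print statements) and will not run under Python 3. For readers on Python 3, here is a minimal sketch of what the fetch-and-extract step might look like; the getLinks name and the link regex are kept from the listing, while the 10-second timeout and the UTF-8 charset fallback are assumptions of this sketch, not part of the original post.

```python
# Minimal Python 3 sketch of the getLinks step above.
# Assumptions (not from the original post): a 10-second timeout and
# a UTF-8 fallback when the server does not declare a charset.
import re
from urllib.request import urlopen

LINK_RE = re.compile(r'"(http://.+?)"', re.DOTALL)

def getLinks(url):
    """Download url and return every quoted "http://..." link, or None on failure."""
    try:
        with urlopen(url, timeout=10) as page:
            raw = page.read()
            charset = page.headers.get_content_charset() or "utf-8"
        html = raw.decode(charset, errors="replace")
        return LINK_RE.findall(html)
    except Exception:
        print("Failed downloading and saving", url)
        return None

if __name__ == "__main__":
    links = getLinks("http://www.douban.com/")
    print(links if links is None else links[:5])
```

Note that the regex, exactly as in the original, only captures double-quoted http:// links; an HTML parser would be more robust, but the sketch stays as close to the original flow as possible.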