Parsing with BeautifulSoup + JSON
# -*- coding=utf-8 -*-
import urllib2
from BeautifulSoup import BeautifulSoup as bs3
import json
import codecs
# chardet: detect the actual character encoding of the raw bytes
import chardet

# save content to file
def save_to_file(filename, content):
    f = open(filename, 'w+')
    assert(f)
    f.write(content)
    f.close()

def parse_key_link(content):
    old_code_name = chardet.detect(content)['encoding']
    print('old_code_name[key_link]=%s' % (old_code_name,))
    js = json.loads(content.decode(old_code_name))
    for i in js['result']['items']:
        print('name=%s, link=%s' % (i['name'].encode(old_code_name), i['link'].encode(old_code_name)))

def parse_config(content):
    old_code_name = chardet.detect(content)['encoding']
    print('old_code_name[config]=%s' % (old_code_name,))
    js = json.loads(content.decode(old_code_name))
    for i in js['result']['paramtypeitems']:
        print('name=%s' % (i['name'].encode(old_code_name),))
        i1 = i['paramitems']
        for j in i1:
            print(' name=%s' % (j['name'].encode(old_code_name),))
            j1 = j['valueitems']
            for k in j1:
                print(' specid=%d,value=%s' % (k['specid'], k['value'].encode(old_code_name)))

def parse_option(content):
    old_code_name = chardet.detect(content)['encoding']
    print('old_code_name[option]=%s' % (old_code_name,))
    js = json.loads(content.decode(old_code_name))
    for i in js['result']['configtypeitems']:
        print('name=%s' % (i['name'].encode(old_code_name),))
        i1 = i['configitems']
        for j in i1:
            print(' name=%s' % (j['name'].encode(old_code_name),))
            j1 = j['valueitems']
            for k in j1:
                print(' specid=%d,value=%s' % (k['specid'], k['value'].encode(old_code_name)))

def parse_color(content):
    old_code_name = chardet.detect(content)['encoding']
    print('old_code_name[color]=%s' % (old_code_name,))
    js = json.loads(content.decode(old_code_name))
    for i in js['result']['specitems']:
        print('specid=%d' % (i['specid'],))
        i1 = i['coloritems']
        for j in i1:
            print(' id=%d,name=%s,value=%s,picnum=%d' % (j['id'], j['name'].encode(old_code_name), j['value'].encode(old_code_name), j['picnum']))

def parse_innerColor(content):
    old_code_name = chardet.detect(content)['encoding']
    print('old_code_name[innerColor]=%s' % (old_code_name,))
    js = json.loads(content.decode(old_code_name))
    for i in js['result']['specitems']:
        print('specid=%d' % (i['specid'],))
        i1 = i['coloritems']
        for j in i1:
            j1 = j['values']
            for k in j1:
                print(' id=%d,name=%s,value=%s,picnum=%d' % (j['id'], j['name'].encode(old_code_name), k.encode(old_code_name), j['picnum']))

def parse_json_data(content):
    name_list = ['keyLink', 'config', 'option', 'color', 'innerColor']
    parse_list = [parse_key_link, parse_config, parse_option, parse_color, parse_innerColor]
    assert(len(content) == len(parse_list))
    for i in range(len(content)):
        parse_list[i](content[i])

def parse_content(content):
    # content is GB2312-encoded
    soup = bs3(content)
    key_text = 'var levelId'
    elem_lib = soup.find('script', text=lambda x: key_text in x)
    # str_script is UTF-8-encoded
    str_script = str(elem_lib.string)
    #print(chardet.detect(str_script))
    # the console uses cp936/GBK; printing fails if the encoding does not match
    strGBK = str_script.decode('utf-8').encode('gb2312')
    #print(strGBK)
    # strip the HTML-escaped non-breaking spaces
    strGBK = strGBK.replace('&nbsp;', '')
    d = strGBK.splitlines()
    list_data = []
    for i in d:
        if i.isspace():
            continue
        # skip the variables we do not need
        if len(i) < 100:
            continue
        # cut out the JSON data
        idx = i.find('{')
        if idx == -1:
            continue
        # drop the trailing ';'
        k = i[idx:-1]
        list_data.append(k)
    parse_json_data(list_data)

def crawler_4_autohome():
    autohome_url = 'http://car.autohome.com.cn/config/series/657.html'
    # fetch the raw page bytes
    content = urllib2.urlopen(url=autohome_url).read()
    #print(chardet.detect(content))
    parse_content(content)

if __name__ == '__main__':
    crawler_4_autohome()
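For reference, here is a minimal, self-contained sketch of the core extraction step the script relies on: find the script block that defines the page variables, cut out the "{...}" literal on each line, and hand it to json.loads. It assumes BeautifulSoup 3 under Python 2, as above; the HTML snippet and the JSON keys ('items', 'name', 'link') are made up for illustration, not taken from the real autohome page.

import json
from BeautifulSoup import BeautifulSoup as bs3

sample_html = '''
<html><body>
<script type="text/javascript">
    var levelId = 657;
    var keyLink = {"result": {"items": [{"name": "demo", "link": "/spec/1"}]}};
</script>
</body></html>
'''

soup = bs3(sample_html)
# with a text filter, BS3 returns the matching NavigableString itself
script_text = soup.find('script', text=lambda x: 'var levelId' in x)
for line in str(script_text.string).splitlines():
    idx = line.find('{')
    if idx == -1:
        continue                                         # no JSON literal on this line
    data = json.loads(line[idx:].rstrip().rstrip(';'))   # drop the trailing ';'
    for item in data['result']['items']:
        print('name=%s, link=%s' % (item['name'], item['link']))

The same pattern scales to the real page: each "var xxx = {...};" line becomes one json.loads call, and the per-variable parse_* functions above only differ in which keys they walk.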
Source code download:
http://download.csdn.net/detail/davidsu33/8447189