Python 处理 HTML 的 table 标签
转载:http://www.xuebuyuan.com/583071.html
用 Python 处理 HTML 的 table 标签(下面依次是 printtab.py、process.sh、process2.sh):
# printtab.py -- Python 2 script (it relies on urllib2 and BeautifulSoup 3).
#
# Usage: python printtab.py PAGE.html
#
# Reads the HTML file named by the first command-line argument and re-emits
# its first <table> on stdout as bare <table>/<tr>/<td> markup, one tag or
# cell text per line. Only <td> cells are emitted; <th> cells are skipped.
import sys
import csv
import urllib2
import BeautifulSoup

# Alternative input: fetch the page from a URL instead of a local file.
# page = urllib2.urlopen(sys.argv[1]).read()

# Read through a context manager so the file handle is closed promptly.
with open(sys.argv[1]) as page:
    soup = BeautifulSoup.BeautifulSoup(page.read())

# Only needed by the disabled CSV output variant below.
csvout = csv.writer(sys.stdout)

# Single-argument print(...) behaves the same as the Python 2 print statement.
for table in soup.findAll('table'):
    print("<table border='1'>")
    # Disabled CSV variant: a comment header listing the <th> texts.
    # print('#')
    # print('# Table')
    # print('# Fields: ' + ','.join([th.text for th in table.findAll('th')]))
    for row in table.findAll('tr'):
        print("<tr>")
        # Disabled CSV variant: one CSV record per table row.
        # csvout.writerow([cell.text for cell in row.findAll('td')])
        for cell in row.findAll('td'):
            print("<td>")
            # Encode explicitly so the output is UTF-8 bytes.
            print(cell.text.encode("utf-8"))
            print("</td>")
        print("</tr>")
    print("</table>")
    # Stop after the first table; later tables are ignored.
    break
#!/bin/bash
# process.sh
#
# Usage: process.sh PAGE.html
#
# Writes baobei.html into the directory of PAGE.html. The file holds a table
# with a Name row and a Price row scraped from PAGE.html, then the table
# printed by printtab.py, then an <img> tag whose src is the first line of
# the "imglist" file in the same directory. Finally every newline and every
# double quote is stripped, so baobei.html ends up as a single line.
# If imglist yields no image, baobei.html is removed and the script exits.

if [ $# -lt 1 ]; then
    echo "usage: $0 PAGE.html" >&2
    exit 1
fi

basedir=$(dirname "$1")
echo "$basedir"
out="$basedir/baobei.html"

# Disabled: start the file with a UTF-8 <head>/<meta> declaration.
#echo \<head\>\<meta http-equiv=\"Content-Type\" content=\"text/html\; charset=UTF-8\" /\>\</head\> >> $basedir/baobei.html

# Text between <title id="id_title"> and </title>: field 2 when split on '>',
# cut again at the next '<'.
prodname=$(grep -o '<title id="id_title">.*</title>' "$1" | cut -d '>' -f 2 | cut -d '<' -f 1)
# $prodname is left unquoted on purpose: word splitting folds several matches
# and runs of whitespace into one space-separated line before the part after
# the first '_' is dropped.
prodname=$(echo $prodname | cut -d _ -f 1)

# Digits inside <span class="s1">...</span>.
price=$(grep -o '<span class="s1">[0-9]*</span>' "$1" | cut -d '>' -f 2 | cut -d '<' -f 1)
# Same folding as above; the value is digits and whitespace only.
price=$(echo $price)

# One redirect for the whole group: truncate baobei.html, then write in order.
{
    echo "<table>"
    echo "<tr>"
    echo "<td>Name</td>"
    echo "<td>$prodname</td>"
    echo "</tr>"
    echo "<tr>"
    echo "<td>Price</td>"
    echo "<td>$price</td>"
    echo "</tr>"
    python ./printtab.py "$1"
    echo "</table>"
} > "$out"

imgsrc=$(head -n 1 "$basedir/imglist")
# No image: discard the half-built page instead of keeping it.
if test "y$imgsrc" = "y"; then rm -f "$out" && exit; fi
echo "<img src='$imgsrc'/>" >> "$out"

# Strip newlines and double quotes; go through a temp file because a file
# cannot be read and truncated by the same pipeline.
tr -d '\n"' < "$out" > "$out.tmp"
mv "$out.tmp" "$out"
#!/bin/bash
# process2.sh
#
# Usage: process2.sh BAOBEI_HTML
#
# Reads the Name and Price cells (and a product-type cell, when present) from
# the single-line HTML file given as $1, maps the product type to a numeric
# category id, copies one *.jpg found under the file's directory to
# ../../template/<md5>.tbi relative to that directory, and prints one
# tab-separated record on stdout.
# Exits silently, printing nothing, when Name, Price or the image is missing.

if [ $# -lt 1 ]; then
    echo "usage: $0 BAOBEI_HTML" >&2
    exit 1
fi

basedir=$(dirname "$1")

# td_value LABEL FILE
# Print the text of the cell that follows "<td>LABEL</td>" in FILE.
# For "<td>LABEL</td><td>VALUE</td>" split on '>', VALUE is in field 4;
# cutting at the next '<' drops the closing tag.
td_value() {
    grep -o "<td>$1</td><td>.*</td>" "$2" | cut -d '>' -f 4 | cut -d '<' -f 1
}

name=$(td_value "Name" "$1")
if test "x$name" = "x"; then exit; fi

price=$(td_value "Price" "$1")
if test "x$price" = "x"; then exit; fi

# $class is used as-is when already set (e.g. from the environment);
# otherwise the first label that yields a value wins.
for label in "产品类型" "设备类型" "打印针数"; do
    if test "x$class" = "x"; then
        class=$(td_value "$label" "$1")
    fi
done
if test "x$class" = "x"; then
    class="条形码打印机"
fi

# Map the product-type text to a category id. The first matching arm wins,
# so the order of the arms matters (e.g. "条形码打印机" is tested before "条码").
case "$class" in
    *票据*|*发票*|*票证*|*存折*)
        class="536187477" ;;
    # NOTE(review): "针" maps to 536187477 here, while the classtable listing
    # names 536187478 as "针式打印机" -- confirm which id is intended.
    *针*)
        class="536187477" ;;
    *灯泡*|*UHE*|*UHP*|*HSCR*)
        class="536187479" ;;
    *条形码打印机*)
        class="536187480" ;;
    *证卡打印*)
        class="536187483" ;;
    *条码*|*扫描*|*阅读*|*采集*|*手持*|*数据终端*)
        class="536187481" ;;
    *激光*)
        class="536187484" ;;
    *喷墨*)
        class="536187486" ;;
    *复印*)
        class="536187615" ;;
    *一体机*)
        class="536187485" ;;
    *硒鼓*|*墨盒*)
        class="536187616" ;;
    *)
        class="536187616" ;;
esac

# Pick exactly one image: with several *.jpg files the md5sum/cp pair below
# would otherwise receive several paths and fail.
imagepath=$(find "$basedir" -type f -iname "*.jpg" | sort | head -n 1)
if test "x$imagepath" = "x"; then exit; fi
# The image is stored under its own MD5 digest.
image=$(md5sum "$imagepath" | cut -d ' ' -f 1)
cp -f "$imagepath" "$basedir/../../template/$image.tbi"

# The whole HTML file becomes the description column.
desc=$(cat "$1")

# Emit the record. The line is kept exactly as published because the column
# layout is positional; $name, $class, $price, $desc and $image are the only
# variable parts.
# NOTE(review): $name and $desc are expanded unquoted, so runs of whitespace
# collapse and glob characters may expand; echo -e also interprets any
# backslash sequences they contain.
echo -e \"$name\""\t"110514"\t"\",$class,\""\t"1"\t"\"上海\""\t"\"上海\""\t"\"b\""\t"$price"\t"0.000000"\t"1"\t"7"\t"2"\t"0.000000"\t"0.000000"\t"0.000000"\t""\t""\t"1"\t"1"\t"0"\t"1"\t"1"\t"0"\t"\"2012-10-16 13:09:48\""\t""\t"\"$desc\""\t""\t"\"20000:31140\;20196:3228846\;29969:107401\;30681:32998\;31468:102250\;31479:92188\;3415558:27513\;3415563:21959\;3415571:21959\;3415581:10122\;3415609:22041\;7884463:75957615\;14319244:80897641\;14319250:123483713\;14791484:10285019\;\""\t""\t""\t"0"\t"0"\t"\"2012-10-16 13:37:51\""\t"100"\t""\t"0"\t"\"$image:0:0:\|\;\""\t"\"\""\t"\"\""\t"\",\""\t"\",\""\t"\"\""\t"\"\""\t"0"\t"\"15758222730\""\t"15758222730
# Category id -> category name. The keys use the same id format that
# process2.sh stores in its `class` variable.
classtable = {
    "536187477": "票据打印机",
    "536187478": "针式打印机",
    "536187479": "投影灯泡",
    "536187480": "条形码打印机",
    "536187481": "条码设备",
    "536187483": "证卡打印机",
    "536187484": "激光打印机",
    "536187485": "多功能一体机",
    "536187486": "喷墨打印机",
    "536187615": "复印复合机",
    "536187616": "硒鼓",
}
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。