【py分析网页】可能有用的-re去除网页上的杂碎
def remove_js_css(content):
    """Strip non-content markup from an HTML string.

    The following are deleted, in this order:

    1. ``<script ...>...</script>`` elements
    2. ``<style ...>...</style>`` elements
    3. ``<!-- ... -->`` comments
    4. ``<meta ...>`` tags
    5. ``<ins ...>...</ins>`` elements

    Matching is case-insensitive, non-greedy, and spans line breaks.

    Args:
        content: HTML text to clean.

    Returns:
        The text with every matched fragment replaced by the empty string.
    """
    # Order is kept from the original implementation: scripts and styles are
    # removed before comments so that comment markers inside them go away
    # together with their enclosing element.
    noise_patterns = (
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'<!--.*?-->',
        r'<meta.*?>',
        r'<ins.*?</ins>',
    )
    cleaned = content
    for pattern in noise_patterns:
        # re.S lets '.' cross newlines; re.I accepts <SCRIPT>, <Style>, ...
        cleaned = re.sub(pattern, '', cleaned, flags=re.I | re.S)
    return cleaned
- 去除空行
def remove_empty_line(content):
    """Remove blank lines from *content*.

    Two passes are applied:

    1. Runs of whitespace that fill whole lines (``^\\s+$`` in multi-line
       mode) are deleted.
    2. Every run of consecutive newlines is collapsed to a single newline.

    Args:
        content: Text to clean.

    Returns:
        The text without whitespace-only lines and without repeated newlines.
    """
    # re.M makes ^ and $ anchor at every line boundary, not just string ends.
    without_blank = re.sub(r'^\s+$', '', content, flags=re.M)
    return re.sub(r'\n+', '\n', without_blank)
def remove_any_tag(s):
    """Delete every ``<...>`` tag from *s* and strip surrounding whitespace.

    Args:
        s: HTML text.

    Returns:
        The text with all tags removed and leading/trailing whitespace
        stripped.
    """
    return re.sub(r'<[^>]+>', '', s).strip()


def remove_any_tag_but_a(s):
    """Measure how much of the text in *s* sits inside ``<a>`` elements.

    Despite the name, this does not return a string.

    Args:
        s: HTML text.

    Returns:
        A ``(anchor_length, text_length)`` tuple: the total length of the
        inner content of all ``<a ...>...</a>`` elements (nested tags
        included, as captured by the regex), and the length of
        ``remove_any_tag(s)``.
    """
    # Match only real anchor openers: "<a>" or "<a" followed by whitespace and
    # attributes. The former "<a[^r][^>]*>" missed bare "<a>" and also matched
    # other tags starting with "a" (e.g. <abbr>, <address>).
    anchor_texts = re.findall(r'<a(?:\s[^>]*)?>(.*?)</a>', s, re.I | re.S)
    visible_text = remove_any_tag(s)
    return len(''.join(anchor_texts)), len(visible_text)


def _replace_tag_with_filler(s, tag_pattern, n):
    """Replace each match of *tag_pattern* in *s* with ``'a' * n``."""
    filler = 'a' * n
    return re.sub(tag_pattern, filler, s, flags=re.I | re.S)


def remove_image(s, n=50):
    """Replace every ``<img ...>`` tag with *n* filler characters.

    Args:
        s: HTML text.
        n: Number of ``'a'`` characters substituted for each tag.

    Returns:
        The text with each image tag replaced by ``'a' * n``.
    """
    # NOTE(review): the filler presumably lets an image count as n characters
    # of content in a later length-based measure -- confirm with callers.
    return _replace_tag_with_filler(s, r'<img.*?>', n)


def remove_video(s, n=1000):
    """Replace every ``<embed ...>`` tag with *n* filler characters.

    Args:
        s: HTML text.
        n: Number of ``'a'`` characters substituted for each tag.

    Returns:
        The text with each embed tag replaced by ``'a' * n``.
    """
    return _replace_tag_with_filler(s, r'<embed.*?>', n)
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。