PHP 正则抓取页面函数整理
整理了一些抓取页面的函数,方便以后使用。
/**
 * Fetch a page with an HTTP GET request via cURL.
 *
 * Only the connection phase is limited (5 seconds); no overall transfer
 * timeout is set.
 *
 * @param string $url Address of the page to download.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function getcontents($url)
{
    $timeout = 5; // seconds allowed for establishing the connection

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

    return curl_exec($ch);
}

/**
 * Turn the HTML of a table into a two-dimensional array of cell texts.
 *
 * Every closing </tr> ends a row and every closing </td> ends a cell; all
 * other markup is stripped. Note that ALL spaces are removed, including
 * spaces inside the cell text.
 *
 * @param string $table HTML fragment containing the table.
 * @return array List of rows, each row being a list of cell strings.
 *               Empty array when no complete row is found.
 */
function get_td_array($table)
{
    // Drop the opening table/tr/td tags together with their attributes.
    $table = preg_replace('~<table[^>]*?>~si', '', $table);
    $table = preg_replace('~<tr[^>]*?>~si', '', $table);
    $table = preg_replace('~<td[^>]*?>~si', '', $table);

    // Mark row and cell boundaries with placeholders. Matching is
    // case-insensitive so that </TR> / </TD> are handled the same way as the
    // opening tags above.
    $table = str_ireplace('</tr>', '{tr}', $table);
    $table = str_ireplace('</td>', '{td}', $table);

    // Strip every remaining HTML tag.
    $table = preg_replace('~<[/!]*?[^<>]*?>~si', '', $table);

    // Remove line breaks followed by whitespace, then entities and spaces.
    $table = preg_replace('~([\r\n])[\s]+~', '', $table);
    // NOTE(review): the published snippet showed two identical space
    // replacements; the first is assumed to have been "&nbsp;" before the
    // page extraction decoded it - confirm against the original.
    $table = str_replace('&nbsp;', '', $table);
    $table = str_replace(' ', '', $table);

    // The text after the last {tr} is not a complete row, so discard it.
    $rows = explode('{tr}', $table);
    array_pop($rows);

    // Initialised up front so an input without rows yields an empty array.
    $td_array = array();
    foreach ($rows as $tr) {
        // Same as above: the piece after the last {td} is not a cell.
        $cells = explode('{td}', $tr);
        array_pop($cells);
        $td_array[] = $cells;
    }

    return $td_array;
}

/**
 * Submit data with an HTTP POST request via cURL (simulates a form post).
 *
 * The body is built as key=value pairs joined with "&"; values are
 * urlencode()d, keys are sent as given.
 *
 * @param string $url      Address to post to.
 * @param array  $postData Field name => scalar value pairs.
 * @return string|false Response body; false when $url is empty or when
 *                      curl_exec() fails.
 */
function curlPost($url, $postData = array())
{
    if (empty($url)) {
        return false;
    }

    $pairs = array();
    foreach ($postData as $k => $v) {
        $pairs[] = "$k=" . urlencode($v);
    }
    $body = implode('&', $pairs);

    $timeout = 5; // seconds allowed for establishing the connection

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0); // do not include response headers in the result
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);

    return curl_exec($ch);
}

// Example: data to submit.
// $url, $totalPage, $prepage and $page are not defined in this snippet and
// must be set by the surrounding code before this runs.
$postData = array(
    // NOTE(review): converting the literal from GBK only makes sense if this
    // source file itself is saved in GBK - confirm the file encoding.
    'region_fullname' => iconv('GBK', 'UTF-8', '黄山'),
    '$total' => $totalPage,
    '$pgsz' => $prepage,
    '$pg' => $page,
);
$contents = curlPost($url, $postData);
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。