PHP - php抓取页面方法汇总
<?php
// Summary of ways to fetch a web page from PHP.

// ---------------------------------------------------------------------------
// 1. file_get_contents()
// ---------------------------------------------------------------------------
$timeout = array(
    'http' => array(
        'timeout' => 5, // timeout in seconds
    ),
);
$ctx = stream_context_create($timeout);
$text = file_get_contents('http://www.baidu.com', false, $ctx);
// var_dump($text);

// ---------------------------------------------------------------------------
// 2. fopen()
// ---------------------------------------------------------------------------

/**
 * Fetch a URL through fopen() using an HTTP stream context with a 5 second
 * timeout.
 *
 * @param string $url URL to open.
 * @return string Everything read from the stream, or '' when the URL could
 *                not be opened or read.
 */
function request($url)
{
    $options = array(
        'http' => array(
            'method'  => 'GET',
            'timeout' => 5, // timeout in seconds
        ),
    );
    $ctx = stream_context_create($options);

    $fp = fopen($url, 'r', false, $ctx);
    if (!$fp) {
        return '';
    }

    // Read to EOF in one call. A `while ($chunk = fread(...))` loop would stop
    // early on any chunk that is falsy as a string (for example "0").
    $response = stream_get_contents($fp);
    fclose($fp);

    return $response === false ? '' : $response;
}

$data = request('http://www.baidu.com');
// var_dump($data);

// file_get_contents() and fopen() can only open remote files when
// allow_url_fopen is enabled (php.ini: allow_url_fopen = On). With it turned
// off neither function can open a remote URL.

// ---------------------------------------------------------------------------
// 3. cURL
// ---------------------------------------------------------------------------

/**
 * Fetch a URL with cURL.
 *
 * @param string       $url    Target URL.
 * @param string       $method 'GET' or 'POST' (compared case-sensitively).
 * @param array|string $data   For POST: passed to CURLOPT_POSTFIELDS when
 *                             non-empty. For GET: appended to the URL as a
 *                             query string (arrays/objects are encoded with
 *                             http_build_query(), strings are appended as-is).
 * @return array array('content' => curl_exec() result,
 *                     'headers' => curl_getinfo() result)
 */
function request2($url, $method = 'GET', $data = '')
{
    $ch = curl_init();

    if ($method == 'POST') {
        curl_setopt($ch, CURLOPT_POST, 1);
        if ($data) {
            curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        }
    } elseif ($method == 'GET' && $data) {
        // http_build_query() only accepts arrays/objects; a plain string is
        // treated as an already-encoded query string.
        $query = (is_array($data) || is_object($data)) ? http_build_query($data) : (string) $data;
        // Do not add a second '?' when the URL already has a query string.
        $glue = strpos($url, '?') === false ? '?' : '&';
        $url .= $glue . $query;
    }

    // curl_setopt($ch, CURLOPT_HEADER, 1); // include the HTTP response headers in the output
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)');
    // Defaults to 0, in which case curl_exec() prints the result directly.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $res  = curl_exec($ch);
    $info = curl_getinfo($ch);
    curl_close($ch);

    return array('content' => $res, 'headers' => $info);
}

$data = request2('http://www.baidu.com');
// var_dump($data);

// Author's note: fopen() / file_get_contents() perform a fresh DNS lookup for
// every request and do not cache DNS information, whereas cURL caches it
// automatically, so requests for pages or images on the same domain need only
// one lookup. That greatly reduces the number of DNS queries, which is why
// cURL performs much better than fopen() / file_get_contents().

// ---------------------------------------------------------------------------
// 4. Sockets
// ---------------------------------------------------------------------------

/**
 * Split an http(s) URL into the parts the socket-based fetchers need.
 *
 * @param string $url
 * @return array|false array('scheme' => 'http'|'https', 'host' => string,
 *                     'port' => int|null, 'uri' => path plus query string),
 *                     or false when the URL is not a usable http(s) URL.
 */
function _fetch_parse_url($url)
{
    $parts = parse_url($url);
    if ($parts === false || !isset($parts['scheme']) || !isset($parts['host']) || $parts['host'] === '') {
        return false;
    }

    $scheme = strtolower($parts['scheme']);
    if ($scheme !== 'http' && $scheme !== 'https') {
        return false;
    }

    // The request target must start with '/', even for "http://host?q=1".
    $uri = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
    if (isset($parts['query'])) {
        $uri .= '?' . $parts['query'];
    }

    return array(
        'scheme' => $scheme,
        'host'   => $parts['host'],
        'port'   => isset($parts['port']) ? (int) $parts['port'] : null,
        'uri'    => $uri,
    );
}

/**
 * Build a complete HTTP/1.1 request message.
 *
 * @param string       $method      HTTP method; upper-cased before use.
 * @param string       $host        Host name for the Host header.
 * @param int          $port        Port actually connected to.
 * @param int          $defaultPort Port that may be omitted from the Host header.
 * @param string       $uri         Request target (path plus query string).
 * @param array|string $postdata    Body sent with POST requests; arrays are
 *                                  encoded with http_build_query().
 * @return string Request line, headers, blank line and (for POST) the body.
 */
function _fetch_build_request($method, $host, $port, $defaultPort, $uri, $postdata)
{
    $method = strtoupper($method);
    if (is_array($postdata)) {
        $postdata = http_build_query($postdata);
    }

    // The Host header carries the host name (not the resolved IP).
    $hostHeader = ($port == $defaultPort) ? $host : $host . ':' . $port;

    $request  = "$method $uri HTTP/1.1\r\n";
    $request .= "Host: $hostHeader\r\n";
    $request .= "Connection: Close\r\n";
    if ($method === 'POST') {
        $request .= "Content-Type: application/x-www-form-urlencoded\r\n";
        $request .= 'Content-Length: ' . strlen($postdata) . "\r\n";
    }
    $request .= "\r\n"; // blank line terminates the header section
    if ($method === 'POST') {
        $request .= $postdata;
    }

    return $request;
}

/**
 * Pick the requested part of a raw HTTP response.
 *
 * The body is returned exactly as received; a chunked Transfer-Encoding is
 * not decoded.
 *
 * @param string $response Raw response (headers, blank line, body).
 * @param int    $rettype  1 = body, 2 = header section, anything else = the
 *                         whole raw response.
 * @return string
 */
function _fetch_select_part($response, $rettype)
{
    $parts = explode("\r\n\r\n", $response, 2);
    $head  = $parts[0];
    $body  = isset($parts[1]) ? $parts[1] : '';

    if ($rettype == 1) {
        return $body;
    }
    if ($rettype == 2) {
        return $head;
    }

    return $response;
}

/**
 * 4.1 Fetch a URL over a raw TCP socket created with socket_create().
 *
 * Terminates the script with die() on an invalid URL or a socket error.
 * The socket is plain TCP (no TLS), so an https URL without an explicit port
 * is still contacted on port 80 in clear text.
 *
 * @param string       $url      http(s) URL.
 * @param int          $rettype  1 = body, 2 = headers, otherwise whole response.
 * @param string       $method   HTTP method, 'GET' or 'POST'.
 * @param array|string $postdata Body for POST requests.
 * @return string
 */
function request3($url, $rettype = 1, $method = 'GET', $postdata = '')
{
    // Parse the URL.
    $target = _fetch_parse_url($url);
    if ($target === false) {
        die('URL格式错误!');
    }
    $port = $target['port'] ? $target['port'] : 80;
    $ip   = gethostbyname($target['host']);

    // 1. Create the socket.
    $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
    if (!$socket) {
        die('创建scoket失败!' . socket_strerror(socket_last_error()));
    }

    // 2. Connect.
    if (!socket_connect($socket, $ip, $port)) {
        die('初始化scoket连接失败!' . socket_strerror(socket_last_error($socket)));
    }

    // 3. Write the request; socket_write() may send only part of the buffer.
    $remaining = _fetch_build_request($method, $target['host'], $port, 80, $target['uri'], $postdata);
    while ($remaining !== '') {
        $sent = socket_write($socket, $remaining, strlen($remaining));
        if ($sent === false) {
            die('写入scoket失败!' . socket_strerror(socket_last_error($socket)));
        }
        $remaining = (string) substr($remaining, $sent);
    }

    // 4. Read until the peer closes the connection ('' ) or an error (false).
    $response = '';
    while (true) {
        $chunk = socket_read($socket, 4096);
        if ($chunk === false || $chunk === '') {
            break;
        }
        $response .= $chunk;
    }

    // 5. Close the socket.
    socket_close($socket);

    return _fetch_select_part($response, $rettype);
}

$response = request3('http://www.baidu.com', 1);
// var_dump($response);

/**
 * 4.2 Fetch a URL through fsockopen().
 *
 * Terminates the script with die() on an invalid URL or a connection error.
 * https URLs are opened through the ssl:// transport (default port 443).
 *
 * @param string       $url      http(s) URL.
 * @param int          $rettype  1 = body, 2 = headers, otherwise whole response.
 * @param string       $method   HTTP method, 'GET' or 'POST'.
 * @param array|string $postdata Body for POST requests.
 * @return string
 */
function request4($url, $rettype = 1, $method = 'GET', $postdata = '')
{
    // Parse the URL.
    $target = _fetch_parse_url($url);
    if ($target === false) {
        die('URL格式错误!');
    }
    $secure      = ($target['scheme'] === 'https');
    $defaultPort = $secure ? 443 : 80;
    $port        = $target['port'] ? $target['port'] : $defaultPort;

    // 1. Open the connection.
    $fp = fsockopen(($secure ? 'ssl://' : '') . $target['host'], $port, $errno, $errstr);
    if (!$fp) {
        die('打开scoket连接失败!' . $errstr);
    }

    // 2. Send the request.
    fwrite($fp, _fetch_build_request($method, $target['host'], $port, $defaultPort, $target['uri'], $postdata));

    // 3. Read the whole response.
    $response = stream_get_contents($fp);
    if ($response === false) {
        $response = '';
    }

    // 4. Close the handle.
    fclose($fp);

    return _fetch_select_part($response, $rettype);
}

$response = request4('http://www.163.com');
// var_dump($response);

// ---------------------------------------------------------------------------
// 5. Snoopy
// ---------------------------------------------------------------------------

/**
 * Fetch a URL with the Snoopy class loaded from interview_lib/snoopy.php.
 *
 * @param string $url URL to fetch.
 * @return mixed The `results` property of the Snoopy instance after fetch().
 */
function request5($url)
{
    include_once 'interview_lib/snoopy.php';

    $snoopy = new snoopy();
    $snoopy->referer = 'http://www.sina.com';
    $snoopy->fetch($url);

    return $snoopy->results;
}

$data = request5('http://www.163.com');
var_dump($data);
// Author's note: Snoopy is convenient and well wrapped; every page tested could be fetched.
?>
参考:http://blog.csdn.net/lxzo123/article/details/6718771
http://www.nowamagic.net/librarys/veda/detail/2585
http://www1.phpchina.com/archives/view-42979-1.html
http://blog.csdn.net/lxzo123/article/details/6718771
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。