PHP - php抓取页面方法汇总

浏览数：40 / 时间：2015年06月09日

    // Summary of ways to fetch a web page in PHP.
    // 1. Using file_get_contents()
    // Stream context options for the http wrapper.
    $timeout = array(
        'http' => array(
            'timeout' => 5, // read timeout, in seconds
        )
    );
    $ctx = stream_context_create($timeout);
    // The second argument is the boolean $use_include_path flag.
    // file_get_contents() returns false (and emits a warning) when the fetch fails.
    $text = file_get_contents("http://www.baidu.com", false, $ctx);
    // var_dump($text);
    
    // 2. Using fopen()
    /**
     * Fetch a URL with a GET request through fopen() and a stream context.
     *
     * Opening remote URLs this way requires allow_url_fopen to be enabled.
     *
     * @param string $url URL to open.
     * @return string Everything read from the stream, or '' when it could not be opened.
     */
    function request($url) {
        $options = array(
            'http' => array(
                'method'  => 'GET',
                'timeout' => 5 // read timeout, in seconds
            )
        );

        $ctx = stream_context_create($options);
        $response = '';

        $fp = fopen($url, 'r', false, $ctx);
        if ($fp === false) {
            return $response;
        }

        // Loop on feof() instead of the truthiness of the chunk: a chunk that
        // is exactly "0" is falsy in PHP and would end the read too early.
        while (!feof($fp)) {
            $chunk = fread($fp, 8192);
            if ($chunk === false || $chunk === '') {
                break;
            }
            $response .= $chunk;
        }
        fclose($fp);

        return $response;
    }
    // Demo: fetch a page with the fopen()-based helper.
    $data = request('http://www.baidu.com');
    //var_dump($data);
    
    //使用file_get_contents和fopen时，服务器（空间）必须开启allow_url_fopen。
    //方法：编辑php.ini，设置 allow_url_fopen = On。allow_url_fopen关闭时，
    //fopen和file_get_contents都不能打开远程文件。
    
    // 3. Using cURL
    /**
     * Fetch a URL with cURL.
     *
     * @param string       $url    Target URL.
     * @param string       $method 'POST' sends $data as the request body; 'GET' appends
     *                             $data to the URL as a query string. For any other value
     *                             no method-specific option is set and $data is ignored.
     * @param array|string $data   POST fields, or query parameters for GET. A string is
     *                             used as-is (assumed to be already URL-encoded); an array
     *                             is encoded with http_build_query() for GET.
     * @return array array('content' => curl_exec() result (string, or false on failure),
     *                     'headers' => curl_getinfo() array). Note that 'headers' holds
     *               cURL transfer information, not the raw HTTP response headers.
     */
    function request2($url, $method='GET', $data='') {
        $ch = curl_init();
        if ($ch === false) {
            // Keep the documented return shape even when no handle could be created.
            return array('content' => false, 'headers' => array());
        }

        if ($method == 'POST') {
            curl_setopt($ch, CURLOPT_POST, 1);
            if ($data) {
                curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
            }
        } elseif ($method == 'GET' && $data) {
            // http_build_query() only accepts arrays/objects; passing a string
            // raises a TypeError on PHP 8, so strings are appended unchanged.
            $query = is_string($data) ? $data : http_build_query($data);
            // Use '&' when the URL already carries a query string.
            $url .= (strpos($url, '?') === false ? '?' : '&') . $query;
        }
        //curl_setopt($ch, CURLOPT_HEADER, 1);  // include the HTTP response headers in the output
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)');
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);   // return the body instead of printing it (default is 0)
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);         // overall timeout, in seconds
        $res = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);
        return array('content' => $res, 'headers' => $info);
    }
    // Demo: fetch a page with the cURL-based helper.
    $data = request2('http://www.baidu.com');
    //var_dump($data);
    
    //fopen / file_get_contents 每次请求都会重新做DNS查询，并不对DNS信息进行缓存。
    //但是CURL会自动对DNS信息进行缓存。对同一域名下的网页或者图片的请求只需要一次DNS查询。
    //这大大减少了DNS查询的次数。
    //所以CURL的性能比fopen / file_get_contents 好很多。
    
    // 4. Using sockets
    // 4.1 Using socket_create()
    /**
     * Fetch a URL over a raw TCP socket (sockets extension) with a hand-built HTTP request.
     *
     * The script is terminated with die() when the URL does not match the expected
     * pattern or when a socket operation fails.
     *
     * NOTE: no TLS handshake is performed here, so an https:// URL is still sent as
     * plain text (to port 80 unless a port is given). The response body is returned
     * exactly as received; a chunked Transfer-Encoding is not decoded.
     *
     * @param string $url      URL of the form http(s)://host[:port][/path?query].
     * @param int    $rettype  1 = response body, 2 = response header block,
     *                         any other value = the complete raw response.
     * @param string $method   HTTP method; the request body is only sent for 'POST'.
     * @param string $postdata Already-encoded request body used for POST.
     * @return string
     */
    function request3($url, $rettype=1, $method='GET', $postdata='') {
        // Parse the URL: [1] scheme, [2] host, [4] optional port, [5] path + query.
        // The host class includes '-' so that hostnames such as "my-site.com" match.
        $pattern = '/^(http|https):\/\/([a-zA-Z0-9_.-]+)(:(\d+)){0,1}(.*)/i';
        if(!preg_match($pattern, $url, $matches)) {
            die('URL格式错误！');
        }
        $hostname = $matches[2];
        $host = gethostbyname($hostname);   // IPv4 address used for the connection
        $port = empty($matches[4]) ? 80 : (int)$matches[4];
        $suri = empty($matches[5]) ? '/' : $matches[5];   // request target (path + query string)
        if ($suri[0] != '/') {
            // e.g. "http://host?x=1": the request target must start with a slash.
            $suri = '/'.$suri;
        }

        // 1. Create the socket.
        $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
        if(!$socket) {
            die('创建scoket失败！'.socket_strerror(socket_last_error()));
        }
        // 2. Connect it to the remote host.
        $sconn = socket_connect($socket, $host, $port);
        if(!$sconn) {
            die('初始化scoket连接失败！'.socket_strerror(socket_last_error($socket)));
        }
        // 3. Build the request: request line first, then headers, a blank line, then the body.
        $verb = strtoupper($method);
        $header  = "$verb $suri HTTP/1.1\r\n";
        // The Host header must carry the name from the URL (not the resolved IP),
        // otherwise name-based virtual hosts cannot route the request.
        $header .= "Host: ".$hostname.($port == 80 ? '' : ':'.$port)."\r\n";
        $header .= "Connection: Close\r\n";
        // $header .= "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0\r\n";
        if($verb == 'POST') {
            $header .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $header .= "Content-Length: ".strlen($postdata)."\r\n";
        }
        $header .= "\r\n";
        if($verb == 'POST') {
            $header .= $postdata;
        }
        // socket_write() may write fewer bytes than requested, so keep writing the rest.
        $pending = $header;
        while($pending !== '') {
            $bytes = socket_write($socket, $pending, strlen($pending));
            if($bytes === false) {
                die('写入scoket失败！'.socket_strerror(socket_last_error($socket)));
            }
            $pending = (string)substr($pending, $bytes);
        }
        // 4. Read until the peer closes the connection. Compare strictly:
        //    a chunk equal to "0" is falsy and must not end the loop.
        $response = '';
        while(true) {
            $chunk = socket_read($socket, 4096);
            if($chunk === false || $chunk === '') {
                break;
            }
            $response .= $chunk;
        }
        // 5. Close the socket.
        socket_close($socket);

        // Split the header block from the body at the first blank line.
        $parts = explode("\r\n\r\n", $response, 2);
        $head = $parts[0];
        $body = isset($parts[1]) ? $parts[1] : '';
        if($rettype == 1) {   // response body
            return $body;
        }elseif($rettype == 2) {  // response headers
            return $head;
        }else {  // complete raw response
            return $response;
        }
    }
    // Demo: fetch the response body with the socket_create()-based helper.
    $response = request3('http://www.baidu.com', 1);
    // var_dump($response);
    
    // 4.2 Using fsockopen()
    /**
     * Fetch a URL through fsockopen() with a hand-built HTTP request.
     *
     * The script is terminated with die() when the URL does not match the expected
     * pattern or when the connection cannot be opened. https:// URLs are opened via
     * the ssl:// transport (needs the openssl extension) and default to port 443.
     * The response body is returned exactly as received; a chunked
     * Transfer-Encoding is not decoded.
     *
     * @param string $url      URL of the form http(s)://host[:port][/path?query].
     * @param int    $rettype  1 = response body, 2 = response header block,
     *                         any other value = the complete raw response.
     * @param string $method   HTTP method; the request body is only sent for 'POST'.
     * @param string $postdata Already-encoded request body used for POST.
     * @return string
     */
    function request4($url, $rettype=1, $method='GET', $postdata='') {
        // Parse the URL: [1] scheme, [2] host, [4] optional port, [5] path + query.
        // The host class includes '-' so that hostnames such as "my-site.com" match.
        $pattern = '/^(http|https):\/\/([a-zA-Z0-9_.-]+)(:(\d+)){0,1}(.*)/i';
        if(!preg_match($pattern, $url, $matches)) {
            die('URL格式错误！');
        }
        $secure = (strtolower($matches[1]) == 'https');
        $hostname = $matches[2];
        $defaultPort = $secure ? 443 : 80;
        $port = empty($matches[4]) ? $defaultPort : (int)$matches[4];
        $suri = empty($matches[5]) ? '/' : $matches[5];   // request target (path + query string)
        if ($suri[0] != '/') {
            // e.g. "http://host?x=1": the request target must start with a slash.
            $suri = '/'.$suri;
        }
        // Plain HTTP connects to the resolved IPv4 address; TLS connects by name
        // so the certificate can be checked against the hostname.
        $host = $secure ? 'ssl://'.$hostname : gethostbyname($hostname);

        // 1. Open the connection.
        $fp = fsockopen($host, $port, $errno, $errstr);
        if(!$fp) {
            die('打开scoket连接失败！'.$errstr);
        }

        // 2. Build and send the request: request line, headers, blank line, body.
        $verb = strtoupper($method);
        $header  = "$verb $suri HTTP/1.1\r\n";
        // The Host header must carry the name from the URL (not the resolved IP),
        // otherwise name-based virtual hosts cannot route the request.
        $header .= "Host: ".$hostname.($port == $defaultPort ? '' : ':'.$port)."\r\n";
        $header .= "Connection: Close\r\n";
        if($verb == 'POST') {
            $header .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $header .= "Content-Length: ".strlen($postdata)."\r\n";
        }
        $header .= "\r\n";
        if($verb == 'POST') {
            $header .= $postdata;
        }
        fwrite($fp, $header);

        // 3. Read until EOF; stop as well when fgets() reports a failure.
        $response = '';
        while (!feof($fp)) {
            $line = fgets($fp, 128);
            if ($line === false) {
                break;
            }
            $response .= $line;
        }

        // 4. Close the handle.
        fclose($fp);

        // Split the header block from the body at the first blank line.
        $parts = explode("\r\n\r\n", $response, 2);
        $head = $parts[0];
        $body = isset($parts[1]) ? $parts[1] : '';
        if($rettype == 1) {   // response body
            return $body;
        }elseif($rettype == 2) {  // response headers
            return $head;
        }else {  // complete raw response
            return $response;
        }
    }
    
    // Demo: fetch the response body with the fsockopen()-based helper.
    $response = request4('http://www.163.com');
    // var_dump($response);
    
    // 5. Using Snoopy
    /**
     * Fetch a URL with the Snoopy class loaded from interview_lib/snoopy.php.
     *
     * The request is sent with a fixed Referer of http://www.sina.com.
     * NOTE(review): the return value of fetch() is not checked; what $results
     * holds after a failed fetch depends on the Snoopy library - confirm.
     *
     * @param string $url URL to fetch.
     * @return mixed The $results property of the Snoopy instance after fetch().
     */
    function request5($url) {
        include_once 'interview_lib/snoopy.php';
        $snoopy = new snoopy();
        $snoopy->referer = 'http://www.sina.com';
        $snoopy->fetch($url);
        return $snoopy->results;
    }
    // Demo: fetch a page with the Snoopy-based helper and dump the result.
    $data = request5('http://www.163.com');
    var_dump($data);
    
    //snoopy很好用，封装好了，测试的都能抓取

    
?>