php 抓取网页的几种方式

最简单的方式是 file_get_contents,其次是 curl。

curl 我写个简单的例子,支持https:

/**
 * Fetch a URL with cURL (HTTP or HTTPS) and return the response body.
 *
 * For HTTPS the peer certificate and host name are verified. If a
 * "cacert.pem" file exists in the current working directory it is used as
 * the CA bundle; otherwise cURL falls back to its default CA store.
 *
 * @param string       $url      URL to request.
 * @param array|string $params   POST fields handed to CURLOPT_POSTFIELDS; only used
 *                               when $is_post is true (an array is sent as
 *                               multipart/form-data by PHP's cURL extension).
 * @param bool         $is_post  true to send a POST request, false for GET.
 * @param int          $time_out Timeout in seconds, applied to both the connect
 *                               phase and the whole transfer.
 * @param array        $header   Extra request headers as a list of "Name: value" lines.
 *
 * @return string|false Response body (headers excluded), or false on failure.
 */
function curl_file_get_contents($url, $params = array(), $is_post = false, $time_out = 20, $header=array())
{
    $ca = getcwd()."/cacert.pem";
    // URL schemes are case-insensitive, so "HTTPS://" must count as SSL too.
    $isSsl = strncasecmp($url, 'https://', 8) === 0;

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4); // force IPv4 name resolution
    curl_setopt($ch, CURLOPT_HEADER, false);                // do not include response headers in the result
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);         // return the body as a string instead of printing it
    curl_setopt($ch, CURLOPT_TIMEOUT, $time_out);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $time_out);

    if ($isSsl) {
        // Verify both the certificate chain and that the certificate matches the host.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        if (is_file($ca)) {
            curl_setopt($ch, CURLOPT_CAINFO, $ca);
        }
    }

    if ($is_post) {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
    } else {
        curl_setopt($ch, CURLOPT_HTTPGET, true);
    }

    if ($header) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }

    $response = curl_exec($ch);
    curl_close($ch);
    return $response;
}
// Usage:
    $fullurl = 'https://www.baidu.com';
    // Extra request headers, one "Name: value" entry per element.
    $header = array(
        'User-Agent: custom bot',
        'Authorization: ok',
        'fromclient: php',
        // Empty Expect header: needed when the POST data exceeds 1024 bytes,
        // otherwise the request may fail or come back as a 504.
        'Expect:',
    );
    // To forward the session cookie, add another entry such as:
    //   'Cookie: ' . session_name() . '=' . $_COOKIE[session_name()]
    $post_data = array('id' => 1);
    // POST request with a 20 second timeout and the custom headers.
    $response = curl_file_get_contents($fullurl, $post_data, true, 20, $header);
    // For a plain GET only the URL is required:
    //   $response = curl_file_get_contents($fullurl);
    echo $response;

如果需要获取链接、图片、表格等数据,可以使用 snoopy:

http://sourceforge.net/projects/snoopy

如果想像 jQuery 一样操作页面元素,可以采用 simple_html_dom。

源码:

https://simplehtmldom.sourceforge.io/docs/1.9/

https://sourceforge.net/projects/simplehtmldom/

使用例子:

https://www.cnblogs.com/rockchip/p/3202552.html (推荐)

https://baijiahao.baidu.com/s?id=1714463047885529357

我写了一个例子:

include "simple_html_dom.php";
$fullurl="https://www.baidu.com";
$localurl="https://www.163.com";
// Fetch the page with cURL, then hand the markup to simple_html_dom.
$response = curl_file_get_contents($fullurl);
// curl_file_get_contents() returns false when the transfer fails.
$html = ($response === false) ? false : str_get_html($response);
if ($html) {
    // Only anchors that actually carry an href attribute are rewritten.
    foreach ($html->find('a[href]') as $element) {
        // Complete the link, e.g. turn ../bb/1.png into https://www.aabb.com/aa/bb/1.png
        $newurl = full_url_format($fullurl, $element->href);
        // Leave javascript:, mailto:, "#fragment" and similar links untouched;
        // replaceHost() only makes sense for http(s) URLs.
        if (!preg_match('#^https?://#i', $newurl)) {
            continue;
        }
        // Swap the host for the one in $localurl and prefix the path with /subDir
        $element->href = replaceHost($newurl, $localurl, '/subDir');
    }
    $news = $html->find("div#news", 0);
    // find(..., 0) yields no element when nothing matches the selector.
    if ($news) {
        $newsHtml = $news->innertext;
        $news->save('test.html');
    }
}
// full_url_format() and replaceHost() are custom helpers, shown further below.

上面代码用到的full_url_format和replaceHost代码如下:

/**
 * Replace the host (and port) of a URL, optionally prefixing its path.
 *
 * The scheme, path, query and fragment of $srcurl are kept; user info is not.
 *
 * @param string $srcurl   URL whose host should be replaced.
 * @param string $hostname New host: either a bare "host" / "host:port" or a
 *                         full URL, from which only host and port are taken.
 * @param string $rootPath Optional path prefix inserted in front of the
 *                         original path (e.g. "/subDir").
 *
 * @return string The rewritten URL. If $srcurl has no scheme, the scheme of
 *                $hostname is used when it has one, otherwise a scheme-relative
 *                "//host/..." URL is returned. An unparsable $srcurl is
 *                returned unchanged.
 */
function replaceHost($srcurl,$hostname,$rootPath=''){
    $srcinfo = parse_url($srcurl);
    if ($srcinfo === false) {
        return $srcurl;
    }

    // $hostname is "host" or "host:port" unless it parses as a URL with a scheme.
    $hostinfo = parse_url($hostname);
    $host = $hostname;
    $port = '';
    $scheme = '';
    if (is_array($hostinfo) && isset($hostinfo['scheme'])) {
        $scheme = $hostinfo['scheme'];
        $host = isset($hostinfo['host']) ? $hostinfo['host'] : $hostname;
        $port = empty($hostinfo['port']) ? '' : ':'.$hostinfo['port'];
    }
    // The scheme of the source URL wins over the one of $hostname.
    if (isset($srcinfo['scheme'])) {
        $scheme = $srcinfo['scheme'];
    }
    $prefix = ($scheme !== '') ? $scheme.'://' : '//';

    // Every component is optional in parse_url() output; isset() (not empty())
    // keeps a legitimate "0" query or fragment.
    $path = isset($srcinfo['path']) ? $srcinfo['path'] : '';
    $query = (isset($srcinfo['query']) && $srcinfo['query'] !== '') ? '?'.$srcinfo['query'] : '';
    $fragment = (isset($srcinfo['fragment']) && $srcinfo['fragment'] !== '') ? '#'.$srcinfo['fragment'] : '';

    return $prefix.$host.$port.$rootPath.$path.$query.$fragment;
}
/**
 * Turn a (possibly relative) link into an absolute URL.
 *
 * - A link that already has a scheme is returned unchanged.
 * - A scheme-relative link ("//host/path") gets the scheme of $baseurl.
 * - A link without a path component (e.g. "#top" or "?a=1") is returned unchanged.
 * - Otherwise the path is resolved against $baseurl: "." and ".." segments are
 *   removed, ".." never climbs above the root, and duplicate slashes collapse.
 *
 * @param string $baseurl Absolute URL of the page the link was found on.
 * @param string $srcurl  Link to resolve.
 *
 * @return string Absolute URL, or $srcurl unchanged when it cannot be resolved
 *                (unparsable input, or $baseurl lacks a scheme or host).
 */
function full_url_format($baseurl, $srcurl) {
    $srcinfo = parse_url($srcurl);
    if ($srcinfo === false || isset($srcinfo['scheme'])) {
        return $srcurl;
    }

    $baseinfo = parse_url($baseurl);
    if ($baseinfo === false || !isset($baseinfo['scheme'], $baseinfo['host'])) {
        return $srcurl;
    }

    // Scheme-relative link: it names its own host, only the scheme is inherited.
    if (substr($srcurl, 0, 2) === '//' && isset($srcinfo['host'])) {
        return $baseinfo['scheme'].':'.$srcurl;
    }
    if (!isset($srcinfo['path'])) {
        return $srcurl;
    }

    $port = empty($baseinfo['port']) ? '' : ':'.$baseinfo['port'];
    $url = $baseinfo['scheme'].'://'.$baseinfo['host'].$port;

    if (substr($srcinfo['path'], 0, 1) == '/') {
        $path = $srcinfo['path'];
    } else {
        // Merge with the base path: keep everything up to and including its
        // last "/". dirname() is not used because it would strip the final
        // directory of a base path that ends in "/".
        $basepath = isset($baseinfo['path']) ? $baseinfo['path'] : '/';
        $cut = strrpos($basepath, '/');
        $basedir = ($cut === false) ? '/' : substr($basepath, 0, $cut + 1);
        $path = $basedir.$srcinfo['path'];
    }

    $fragment = (isset($srcinfo['fragment']) && $srcinfo['fragment'] !== '') ? '#'.$srcinfo['fragment'] : '';
    $query = (isset($srcinfo['query']) && $srcinfo['query'] !== '') ? '?'.$srcinfo['query'] : '';

    // Normalise the path segment by segment.
    $parts = explode('/', $path);
    $segments = array();
    foreach ($parts as $part) {
        if ($part === '..') {
            // Go up one level; popping an empty list keeps us at the root.
            array_pop($segments);
        } elseif ($part !== '' && $part !== '.') {
            // Strict comparisons so that a segment named "0" is kept.
            $segments[] = $part;
        }
    }

    $resolved = '/'.implode('/', $segments);
    // A path ending in "/", "." or ".." refers to a directory: keep the trailing slash.
    $last = end($parts);
    if ($segments && ($last === '' || $last === '.' || $last === '..')) {
        $resolved .= '/';
    }

    $url .= str_replace('\\', '/', $resolved).$query.$fragment;
    return $url;
}

以上足够使用。

对于 https 链接,snoopy 和 simple_html_dom 默认是不支持的,可以改用 curl 抓取页面后再解析。

点赞

发表评论

电子邮件地址不会被公开。必填项已用 * 标注