去发现生活中的美好,记录生活中的点点滴滴

php采集网站数据

php admin 2971℃

一般采集网站数据的流程为:先通过 cURL 模拟浏览器请求 URL,拿到返回的 HTML 字符串,再用正则匹配(preg_match_all、preg_match)和字符串处理(str_replace、substr 等)提取出想要的数据。

以下是获取大众点评店名和电话号码的实例,废话不多说,直接贴代码:

/**
 * Looks up shops on m.dianping.com for the `keyword` input and dumps each
 * shop's name together with its phone number(s).
 *
 * The keyword is read through the input() helper (defined outside this
 * snippet), upper-cased and URL-encoded before being appended to the search
 * URL. For every shop id found on the result page, the shop's detail page is
 * fetched to extract its phone numbers.
 *
 * Side effects: prints the collected rows with print_r() and terminates the
 * script; one HTTP request is issued per shop in addition to the search page.
 */
public function getWebData()
{
    // Cast first so a missing keyword becomes '' instead of passing null on.
    $key = urlencode(strtoupper((string) input('keyword')));
    // NOTE(review): the "7" path segment is presumably a city id - confirm.
    $url = "http://m.dianping.com/search/keyword/7/0_".$key;  // search result page

    $shops = $this->getShop($url);
    $webData = [];
    // Iterate by value: the element is never modified, and a by-reference
    // loop variable would stay bound to the last element after the loop.
    foreach ($shops['shopId'] as $k => $shopId) {
        $detailUrl = "http://m.dianping.com/shop/".$shopId;
        $phone = join(',', $this->getPhone($detailUrl));

        // Ids and names are extracted by separate regex passes, so the two
        // lists are not guaranteed to have the same length.
        $name = isset($shops['shopname'][$k]) ? trim($shops['shopname'][$k]) : '';

        $webData[$k]['shopname'] = $name;
        $webData[$k]['phone'] = $phone;
    }
    print_r($webData);exit;
}


/**
 * Fetches a search result page and extracts the shop ids and shop names.
 *
 * @param string $url Search result page URL.
 * @return array Array with two lists: 'shopId' (captures from the
 *               href="/shop/..." links) and 'shopname' (inner content of
 *               each <h3 class="shopname"> element, untrimmed). Both lists
 *               are empty when the page could not be downloaded.
 */
private function getShop($url){
    $shop = array('shopId' => array(), 'shopname' => array());

    // Download the page once and reuse it for both extractions; fetching it
    // twice doubles the traffic and lets ids and names come from different
    // responses.
    $html = $this->postCurl($url);
    if (!is_string($html) || $html === '') {
        return $shop;
    }

    // [^"]* stops at the closing quote. A greedy .* would run on to the last
    // quote on the line and swallow any attributes that follow the href.
    preg_match_all('/href="\/shop\/([^"]*)"/', $html, $shopIds);
    $shop['shopId'] = $shopIds[1];

    // Strip the empty badge spans so they do not end up inside the names.
    $badges = array(
        '<span class="mark-group"></span>',
        '<span class="mark-promo"></span>',
        '<span class="mark-takeaway"></span>',
    );
    $cleaned = str_replace($badges, '', $html);

    preg_match_all('/<h3 class="shopname">([\w\W]*?)<\/h3>/', $cleaned, $shopName);
    $shop['shopname'] = $shopName[1];
    return $shop;
}

/**
 * Fetches a shop detail page and extracts the phone numbers from its
 * href="tel:..." links.
 *
 * @param string $url Shop detail page URL.
 * @return array List of captured phone strings; empty when the page could
 *               not be downloaded or contains no tel: link.
 */
private function getPhone($url){
    $html = $this->postCurl($url);
    if (!is_string($html) || $html === '') {
        return array();
    }

    // [^"]* stops at the closing quote of the href. A greedy .* would extend
    // to the last quote on the line and pull trailing markup into the number.
    preg_match_all('/href="tel:([^"]*)"/', $html, $matches);
    return $matches[1];
}

/**
 * Downloads a URL with cURL while presenting browser-like request headers.
 *
 * Despite its name, this issues a plain GET request: no POST option or body
 * is ever set on the handle.
 *
 * @param string $url URL to fetch.
 * @return string|false Response body (gzip-decoded by cURL), or false when
 *                      the handle could not be created or the transfer failed.
 */
private function postCurl($url){
    // Declares the charset of this script's own output. Guarded because
    // header() raises a warning once output has started, and this method
    // runs once per fetched page.
    if (!headers_sent()) {
        header("content-type:text/html;charset=utf-8");
    }

    $curl = curl_init($url);
    if ($curl === false) {
        return false;
    }

    // Request headers only. Response-side headers (Set-Cookie, Date,
    // Content-Encoding, Content-Language, Content-Type) have no meaning on a
    // body-less GET and are deliberately not sent.
    $header = array();
    $header[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';
    $header[] = 'Cache-Control:no-cache';
    $header[] = 'Connection:keep-alive';
    // Cookie string captured from a real browser session.
    $header[] = 'Cookie:_hc.v=ca87c14c-9e44-2928-6489-0366f7389a9b.1480496379; __utma=1.1049748573.1480496379.1480496379.1480496379.1; __utmz=1.1480496379.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s_ViewType=10; aburl=1; cy=7; cye=shenzhen; switchcityflashtoast=1; PHOENIX_ID=0a060940-158b9260a80-504946; download_banner=on; source=m_browser_test_22; issqt=false; sqttype=0; cityid=6; __mta=42174874.1480496437420.1480577398028.1480578913342.9; msource=default; default_ab=shop%3AA%3A1%7Cindex%3AA%3A1%7CshopList%3AA%3A1; __mta=42174874.1480496437420.1480578086298.1480578930471.10; pvhistory="6L+U5ZuePjo8L3N0YXRpY3Rlc3QvbG9nZXZlbnQ/bmFtZT1XaGVyZUFtSUZhaWwmaW5mbz1odG1sLSU1QiU3QiUyMmNvZGUlMjIlM0ExJTJDJTIybWVzc2FnZSUyMiUzQSUyMk9ubHklMjBzZWN1cmUlMjBvcmlnaW5zJTIwYXJlJTIwYWxsb3dlZCUyMChzZWUlM0ElMjBodHRwcyUzQSUyRiUyRmdvby5nbCUyRlkwWmtOVikuJTIyJTdEJTVEJmNhbGxiYWNrPVdoZXJlQW1JMTE0ODA1NzkzODAzMjE+OjwxNDgwNTc5NDU0NDA3XV9b"; m_flash2=1';

    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    // Do not include the response headers in the returned string.
    curl_setopt($curl, CURLOPT_HEADER, 0);
    // Advertise gzip and let cURL decode the compressed body.
    curl_setopt($curl, CURLOPT_ENCODING, 'gzip');
    // Present a desktop Chrome user agent.
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36');
    // Return the body as a string instead of echoing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): certificate verification is disabled, as in the original;
    // re-enable it before pointing this helper at https URLs.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    // Bound the request so one stalled page cannot hang the whole scrape.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);

    $rs = curl_exec($curl);
    curl_close($curl);
    return $rs;
}

转载请注明:永盟博客 » php采集网站数据

喜欢 (1)