采集网站数据的一般流程为:通过 cURL 模拟浏览器请求 URL,拿到响应字符串后,再经过正则匹配(preg_match_all、preg_match)、字符串处理(str_replace、substr 等)等操作,提取出想要的数据。
以下是采集大众点评店名和电话号码的实例,废话不多说,直接贴代码:
/**
 * Searches m.dianping.com for a keyword and dumps each shop's name and phone numbers.
 *
 * Reads the `keyword` request parameter through the framework's input() helper,
 * downloads the search result page, then downloads every listed shop's detail
 * page to collect its phone numbers. The collected rows are printed with
 * print_r() and the script terminates.
 *
 * @return void
 */
public function getWebData()
{
    $key = urlencode(strtoupper((string) input('keyword')));
    // Search result page for the keyword.
    $url = 'http://m.dianping.com/search/keyword/7/0_' . $key;

    $shops = $this->getShop($url);
    $webData = [];

    // Iterate by value: nothing is modified in place, and a by-reference loop
    // variable would stay bound to the last element after the loop.
    foreach ($shops['shopId'] as $k => $shopId) {
        $detailUrl = 'http://m.dianping.com/shop/' . $shopId;

        // The id list and the name list come from two independent regexes,
        // so an index present in one may be absent from the other.
        $name = isset($shops['shopname'][$k]) ? $shops['shopname'][$k] : '';

        $webData[$k] = [
            'shopname' => trim($name),
            'phone'    => join(',', $this->getPhone($detailUrl)),
        ];
    }

    print_r($webData);
    exit;
}

/**
 * Extracts shop ids and shop names from a search result page.
 *
 * @param string $url Search result page URL.
 *
 * @return array Array with two lists: 'shopId' (text following "/shop/" in
 *               each matching href) and 'shopname' (inner content of each
 *               <h3 class="shopname"> element). Both lists are empty when the
 *               page could not be fetched or nothing matched.
 */
private function getShop($url)
{
    $shop = ['shopId' => [], 'shopname' => []];

    // Fetch the page once and reuse it for both extractions.
    $html = $this->postCurl($url);
    if (!is_string($html)) {
        return $shop;
    }

    // [^"]* stops at the closing quote of the href attribute; a greedy .*
    // would run on to the last quote on the same line.
    if (preg_match_all('/href="\/shop\/([^"]*)"/', $html, $shopIds)) {
        $shop['shopId'] = $shopIds[1];
    }

    // Drop the empty marker spans so they do not end up inside the shop name.
    $html = str_replace(
        [
            '<span class="mark-group"></span>',
            '<span class="mark-promo"></span>',
            '<span class="mark-takeaway"></span>',
        ],
        '',
        $html
    );

    if (preg_match_all('/<h3 class="shopname">([\w\W]*?)<\/h3>/', $html, $shopName)) {
        $shop['shopname'] = $shopName[1];
    }

    return $shop;
}

/**
 * Extracts the phone numbers linked on a shop detail page.
 *
 * @param string $url Shop detail page URL.
 *
 * @return array List of the values found in href="tel:..." attributes; empty
 *               when the page could not be fetched or contains no such link.
 */
private function getPhone($url)
{
    $html = $this->postCurl($url);
    if (!is_string($html)) {
        return [];
    }

    // [^"]* keeps the capture inside the attribute value.
    if (!preg_match_all('/href="tel:([^"]*)"/', $html, $matches)) {
        return [];
    }

    return $matches[1];
}

/**
 * Downloads a URL with cURL using browser-like request headers.
 *
 * Despite the method name, no POST option is set, so cURL's default request
 * method is used. Also sets this script's own response content type, if
 * headers have not been sent yet.
 *
 * @param string $url URL to download.
 *
 * @return string|false Response body, or false when the transfer failed.
 */
private function postCurl($url)
{
    // Repeated calls are harmless, but header() warns once output has started.
    if (!headers_sent()) {
        header("content-type:text/html;charset=utf-8");
    }

    $curl = curl_init($url);
    if ($curl === false) {
        return false;
    }

    // Only request headers belong here; response-only headers such as
    // Set-Cookie or Date have no meaning in a request.
    $header = [
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Cache-Control:no-cache',
        'Connection:keep-alive',
        'Cookie:_hc.v=ca87c14c-9e44-2928-6489-0366f7389a9b.1480496379; __utma=1.1049748573.1480496379.1480496379.1480496379.1; __utmz=1.1480496379.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s_ViewType=10; aburl=1; cy=7; cye=shenzhen; switchcityflashtoast=1; PHOENIX_ID=0a060940-158b9260a80-504946; download_banner=on; source=m_browser_test_22; issqt=false; sqttype=0; cityid=6; __mta=42174874.1480496437420.1480577398028.1480578913342.9; msource=default; default_ab=shop%3AA%3A1%7Cindex%3AA%3A1%7CshopList%3AA%3A1; __mta=42174874.1480496437420.1480578086298.1480578930471.10; pvhistory="6L+U5ZuePjo8L3N0YXRpY3Rlc3QvbG9nZXZlbnQ/bmFtZT1XaGVyZUFtSUZhaWwmaW5mbz1odG1sLSU1QiU3QiUyMmNvZGUlMjIlM0ExJTJDJTIybWVzc2FnZSUyMiUzQSUyMk9ubHklMjBzZWN1cmUlMjBvcmlnaW5zJTIwYXJlJTIwYWxsb3dlZCUyMChzZWUlM0ElMjBodHRwcyUzQSUyRiUyRmdvby5nbCUyRlkwWmtOVikuJTIyJTdEJTVEJmNhbGxiYWNrPVdoZXJlQW1JMTE0ODA1NzkzODAzMjE+OjwxNDgwNTc5NDU0NDA3XV9b"; m_flash2=1',
    ];
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);

    // Do not include the response headers in the returned string.
    curl_setopt($curl, CURLOPT_HEADER, 0);
    // Ask for gzip and let cURL decompress the body.
    curl_setopt($curl, CURLOPT_ENCODING, 'gzip');
    // Present a desktop browser user agent.
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36');
    // Return the body as a string instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): certificate verification is disabled; this only matters
    // if an https URL is ever passed in - confirm whether it is still wanted.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    // One unresponsive page must not stall the whole crawl.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);

    $rs = curl_exec($curl);
    curl_close($curl);

    return $rs;
}