零、用 PHP 做爬虫的优势:可以直接集成到已有的 PHP 网站中。
一、屏蔽错误
// Switch libxml to internal error handling: parse errors are stored in
// libxml's error buffer instead of being raised as PHP warnings.
libxml_use_internal_errors(true);
/**
 * Fetch a URL with cURL and return the response body.
 *
 * A GET request is sent by default. When $post_data is not null the request
 * is sent as a POST with $post_data as its fields/body.
 *
 * NOTE(review): TLS peer and host verification are switched off below, so
 * HTTPS responses are not authenticated. This is kept unchanged here; confirm
 * it is acceptable for the sites being crawled.
 *
 * @param string            $url       Address to request.
 * @param array|string|null $post_data POST payload, or null for a GET request.
 *
 * @return string|false Response body without headers, or false if the transfer failed.
 */
public function curl($url,$post_data=null)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // Body only: do not prepend the response headers to the returned string.
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    // Make curl_exec() return the body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    if ( !is_null($post_data) ) {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
    }

    $html = curl_exec($ch);
    curl_close($ch);

    // Hand back the body captured by curl_exec() (false on failure).
    return $html;
}/*curl*/
三、解析键名缺少引号的json数据
/**
 * Decode JSON-like text whose object keys may be missing their double quotes.
 *
 * Strict JSON is decoded untouched. Only when strict decoding fails are bare
 * keys repaired: a run of word characters that directly follows "{" or ","
 * and precedes ":" is wrapped in double quotes, then decoding is retried.
 * Anchoring on "{" / "," keeps colons inside values (URLs, times such as
 * "12:30") from being mistaken for key separators.
 *
 * Limitation: a string value that itself contains text shaped like
 * `, word:` is still rewritten, and single-quoted strings are not handled.
 *
 * @param string $str  JSON or JSON-like text.
 * @param bool   $mode Passed to json_decode(): true for associative arrays,
 *                     false for stdClass objects.
 *
 * @return mixed Decoded value, or null when the text cannot be decoded.
 */
public function ext_json_decode($str, $mode=true)
{
    // Valid JSON must never be rewritten, so try it as-is first.
    $decoded = json_decode($str, $mode);
    if (json_last_error() === JSON_ERROR_NONE) {
        return $decoded;
    }

    // Quote bare keys only where a key can legally appear: after "{" or ",".
    $quoted = preg_replace('/([{,]\s*)(\w+)\s*:/', '$1"$2":', $str);
    if ($quoted === null) {
        return null;
    }

    return json_decode($quoted, $mode);
}
四、用 SimpleXML 直接解析 HTML 文档时,碰到特殊字符经常解析失败;DOMDocument 的容错能力更强,所以先用 DOMDocument 加载,再转换成 SimpleXML。
/*html 转 simplexml */
/**
 * Parse an HTML string with DOMDocument and expose it as a SimpleXMLElement.
 *
 * libxml errors are collected internally for the duration of the call, then
 * the error buffer is cleared and the caller's previous error-handling mode
 * is restored. Clearing matters for long crawls: the buffer otherwise keeps
 * every parse error from every page.
 *
 * @param string $html HTML markup to parse.
 *
 * @return SimpleXMLElement|null Root element of the parsed document, or null
 *                               when the input is empty or cannot be parsed.
 */
public function html_to_xml($html)
{
    // NOTE(review): $meta is empty, so nothing is actually prepended;
    // presumably it once held a charset <meta> tag - confirm and restore if needed.
    $meta = '';
    $source = $meta.$html;

    // loadHTML() rejects empty input, so bail out before calling it.
    if (trim($source) === '') {
        return null;
    }

    $previous = libxml_use_internal_errors(true);
    try {
        $dom = new DOMDocument();
        if (!$dom->loadHTML($source)) {
            return null;
        }
        $xml = simplexml_import_dom($dom);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    // instanceof rather than a truthiness test: an element without children
    // or attributes evaluates to false.
    return $xml instanceof SimpleXMLElement ? $xml : null;
}
/**
 * Collect link titles from a parsed page.
 *
 * Every element matched by //div[@class="p"]/div is scanned; for each <div>
 * nested inside it, the "title" attribute of that div's first <a> is
 * recorded. Nested divs without an <a> are skipped.
 *
 * @param SimpleXMLElement $xml Parsed document to search.
 *
 * @return array<int, array{title: string}> One entry per nested div that
 *                                          contains a link; empty when
 *                                          nothing matches.
 */
function parse_xml($xml) {
    $videos = [];

    $xml_items = $xml->xpath('//div[@class="p"]/div');
    // xpath() yields an empty array when nothing matches and a non-array on error.
    if (!$xml_items) {
        return $videos;
    }

    foreach ($xml_items as $xml_item) {
        // Switch to the DOM API to use getElementsByTagName().
        $container = dom_import_simplexml($xml_item);

        foreach ($container->getElementsByTagName('div') as $item) {
            $anchor = $item->getElementsByTagName('a')->item(0);
            if ($anchor === null) {
                continue;
            }

            $videos[] = ['title' => $anchor->getAttribute('title')];
        }
    }

    return $videos;
} /*parse_xml*/