PHP中多线程抓取网页
作者:admin 日期:2012-04-06
用PHP自带的curl_multi系列函数实现的并发下载工具(在单个线程内以非阻塞方式同时发起多个请求,并非真正意义上的多线程)。在需要同时抓取多个网址时,比逐个调用file_get_contents,或逐个执行linux自带的命令行curl、wget要快得多。
大家如果觉得好,就拿去直接用吧。
/**
 * Download one or more URLs concurrently through the cURL multi interface.
 *
 * All transfers are driven together from this single call; the call gives up
 * waiting after roughly 10 seconds and returns whatever has been received.
 *
 * @param mixed $urlArray A single URL string, or an array of URL strings
 *                        (pages or files) to fetch.
 * @return mixed false when $urlArray is empty. For a string argument, the
 *               downloaded body as a string. For an array argument, an array
 *               mapping each URL to whatever curl_multi_getcontent() reports
 *               for it (null when no handle could be created for that URL).
 *               Duplicate URLs collapse into a single key.
 */
function multiDownload($urlArray) {
    if (empty($urlArray)) return false;

    // Remember whether the caller passed a bare string so the result can be
    // converted back to a string at the end.
    $isStr = is_string($urlArray);
    if ($isStr) {
        $urlArray = array($urlArray);
    }

    self::log(sprintf("%s Multi thread download begin...", __METHOD__));

    // Options applied to every transfer.
    $options = array(
        CURLOPT_RETURNTRANSFER => true, // hand the body back instead of printing it
        CURLOPT_AUTOREFERER    => true, // set Referer automatically when following a redirect
        CURLOPT_FOLLOWLOCATION => true, // follow 301/302 style redirects
        CURLOPT_MAXREDIRS      => 2,    // maximum number of redirects to follow
        CURLOPT_HEADER         => 0,    // do not include response headers in the output
        CURLOPT_ENCODING       => "",   // empty string: advertise every supported encoding
        CURLOPT_CONNECTTIMEOUT => 5,    // connection timeout in seconds
    );

    $mh = curl_multi_init();
    $curlArray = array();
    foreach ($urlArray as $i => $url) {
        self::log(sprintf("%s Download url: |%s|...", __METHOD__, $url));
        $ch = curl_init($url);
        if ($ch === false) {
            // No handle could be created; this URL is reported as null below.
            continue;
        }
        // Set options one by one (not curl_setopt_array) so that one rejected
        // option does not prevent the remaining ones from being applied.
        foreach ($options as $option => $value) {
            curl_setopt($ch, $option, $value);
        }
        curl_multi_add_handle($mh, $ch);
        $curlArray[$i] = $ch;
    }

    // Drive all transfers until they finish, the multi stack reports an
    // error, or the overall 10 second budget is used up.
    $running = 0;
    $deadline = microtime(true) + 10.0;
    do {
        do {
            $status = curl_multi_exec($mh, $running);
        } while ($status === CURLM_CALL_MULTI_PERFORM); // only returned by old libcurl versions

        if ($status !== CURLM_OK || $running < 1) {
            break;
        }

        // Wait for socket activity (at most 100 ms) instead of sleeping
        // unconditionally, so finished transfers are noticed immediately.
        if (curl_multi_select($mh, 0.1) === -1) {
            // select failed; back off briefly to avoid a busy loop.
            usleep(100000);
        }
    } while (microtime(true) < $deadline);

    $content = array();
    foreach ($urlArray as $i => $url) {
        $content[$url] = isset($curlArray[$i]) ? curl_multi_getcontent($curlArray[$i]) : null;
    }

    foreach ($curlArray as $ch) {
        curl_multi_remove_handle($mh, $ch);
        // Before PHP 8 handles are resources that must be closed explicitly;
        // from PHP 8 on they are objects and curl_close() has no effect.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }
    }
    curl_multi_close($mh);

    self::log(sprintf("%s Multi thread download end...", __METHOD__));

    // A string argument gets a string result.
    if ($isStr) $content = implode('', $content);
    return $content;
}
评论: 0 | 引用: 0 | 查看次数: 1932
发表评论
广告位