由于工作需要,最近用 PHP 写了一个小工具:抓取 proxycn.com 的代理数据,并通过这些代理下载 CNET 上的软件来刷下载量。
1、抓取"代理中国"(proxycn.com)的代理列表数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
// Remove PHP's execution time limit for this script (0 means no limit).
set_time_limit(0);
/**
 * Scrape one proxy listing page and append every proxy found to a file.
 *
 * Each parsed entry is appended as one "IP:Port:Area\r\n" line through
 * wfile() in append mode. Nothing is written when the page cannot be
 * fetched or does not contain the expected markup.
 *
 * @param string $url  URL of the listing page to fetch
 * @param string $file Path of the file the entries are appended to
 * @return void
 */
function FetchProxyCn($url, $file)
{
    if (empty($url)) {
        return;
    }

    $html = file_get_contents($url);
    if ($html === false) {
        // Fetch failed: there is nothing to parse.
        return;
    }

    // The U modifier inverts greediness, so "(.*?)" is greedy in the two
    // patterns below and captures up to the LAST closing marker on the page.
    if (!preg_match('/<td background="img\/table_4.gif">(.*?)<td background="img\/table_5.gif">/isU', $html, $list)) {
        return;
    }
    if (!preg_match('/<script language="JavaScript">(.*?)<\/script>/isU', $list[1], $iport)) {
        return;
    }

    // Split the captured markup on the upper-case closing tags, then drop the
    // first and the last fragment (same result as the original pair of unset()
    // calls followed by array_values()).
    $rows = explode('{SCRIPT}', str_replace('</SCRIPT>', '{SCRIPT}', $iport[1]));
    $rows = array_slice($rows, 1, -1);

    $lines = '';
    foreach ($rows as $row) {
        // Ports may have up to five digits (max 65535); the former {1,4}
        // limit silently dropped every port >= 10000.
        if (!preg_match("/<TD class=\"list\">(\d{1,5})<\/TD>/isU", $row, $port)) {
            continue;
        }
        if (!preg_match("/whois=(.*) /", $row, $ip)) {
            continue;
        }
        // The area column is optional; keep the entry with an empty area.
        $area = '';
        if (preg_match("/HTTP<\/TD><TD class=\"list\">(.*?)<\/TD>/is", $row, $areaMatch)) {
            $area = $areaMatch[1];
        }
        $lines .= $ip[1] . ':' . $port[1] . ':' . $area . "\r\n";
    }

    // Append the whole page in one write instead of reopening the file
    // once per entry.
    if ($lines !== '') {
        wfile($file, $lines, 'a');
    }
}
/**
 * Write a string to a file.
 *
 * The file is opened with the given fopen() mode, locked exclusively for the
 * duration of the write (best effort: the write still happens if the lock
 * cannot be obtained), and the process umask is restored on every path.
 *
 * @param string $file Path of the file
 * @param string $str  Content to write
 * @param string $mode fopen() mode, 'w' (truncate) by default
 * @return bool true when the whole string was written, false otherwise
 */
function wfile($file, $str, $mode = 'w')
{
    $str = (string) $str;

    // Temporarily clear the umask so a newly created file gets the default
    // permissions unmasked; restored below on both success and failure.
    $oldmask = @umask(0);
    $fp = @fopen($file, $mode);
    if (!$fp) {
        @umask($oldmask);
        return false;
    }

    // The original passed 3 here, which is LOCK_UN (unlock), so no lock was
    // ever taken. LOCK_EX is the exclusive lock a writer needs.
    $locked = @flock($fp, LOCK_EX);
    $written = @fwrite($fp, $str);
    if ($locked) {
        @flock($fp, LOCK_UN);
    }
    @fclose($fp);
    @umask($oldmask);

    return $written !== false && $written === strlen($str);
}
// File that receives the scraped proxy list.
$file = 'proxycn.txt';

// Decide whether the list has to be (re)built: always when the file is
// missing, otherwise only when its change time is more than one second old.
$rebuild = true;
if (file_exists($file)) {
    $ctime = filectime($file);
    $now = time();
    $timediff = $now - $ctime;
    $rebuild = ($timediff > 1);
    if ($rebuild) {
        // Drop the stale list before the pages are appended again.
        unlink($file);
    }
}

if ($rebuild) {
    // Walk listing pages 1..116 and append each page's entries to the file.
    for ($i = 1; $i <= 116; $i++) {
        $url = "http://www.proxycn.com/html_proxy/http-$i.html";
        FetchProxyCn($url, $file);
    }
}
|
保存代码为proxycn.php, 这段代码会抓取代理中国列表数据并且在当前目录生成名为proxycn.txt的文本文件,格式为
IP:Port:地区
2、抓取cnet页面内容,分析得到最终下载链接.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
// Tell clients and intermediaries not to cache this response.
header("Expires: Mon, 26 Jul 1970 05:00:00 GMT");
header("Last-Modified: " . gmdate("D, d M Y H:i:s") . " GMT");
header("Cache-Control: no-cache, must-revalidate");
header("Pragma: no-cache");

$ip = isset($_SERVER["REMOTE_ADDR"]) ? $_SERVER["REMOTE_ADDR"] : '';

// Request parameters. Missing or non-string values (e.g. "proxy[]=x") fall
// back to an empty string instead of raising notices or type errors.
$product_name = (isset($_GET['product_name']) && is_string($_GET['product_name'])) ? $_GET['product_name'] : '';
$product_url  = (isset($_GET['product_url']) && is_string($_GET['product_url'])) ? $_GET['product_url'] : '';
$time         = (isset($_GET['time']) && is_string($_GET['time'])) ? $_GET['time'] : '';
$proxy        = (isset($_GET['proxy']) && is_string($_GET['proxy'])) ? $_GET['proxy'] : '';
// $type == 1 enables the proxy; any other value disables it
$type         = (isset($_GET['type']) && is_string($_GET['type'])) ? $_GET['type'] : '';

// product_name comes straight from the query string: strip any directory
// part so the request cannot delete or overwrite a .txt file outside the
// current directory (e.g. product_name=../../some/path).
$file_name = basename($product_name) . '.txt';

// Remove the result of a previous run for the same product.
if (file_exists($file_name)) {
    unlink($file_name);
}
/**
 * Fetch a URL through a proxy using cURL.
 *
 * Redirects are followed, the request times out after 120 seconds, and the
 * response headers are included in the returned data (CURLOPT_HEADER is on).
 *
 * @param string $url        URL to request through the proxy
 * @param string $user_agent User-Agent string to send
 * @param string $proxy      Proxy address passed to CURLOPT_PROXY
 * @return mixed the value returned by curl_exec() for this request
 */
function curl_string($url, $user_agent, $proxy)
{
    $handle = curl_init();

    // Options are applied one by one, in this order.
    $settings = array(
        CURLOPT_PROXY          => $proxy,
        CURLOPT_URL            => $url,
        CURLOPT_USERAGENT      => $user_agent,
        CURLOPT_COOKIEJAR      => "c:\cookie.txt",
        CURLOPT_HEADER         => 1,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_TIMEOUT        => 120,
    );
    foreach ($settings as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
define('BAIDU_URL', $product_url);

// Final download link. Stays empty when the parameters are invalid or any
// parsing step fails, so the write at the end never sees an undefined value.
$urlexe = '';

// product_url is user input: only fetch real web URLs so file_get_contents()
// cannot be pointed at local files or other stream wrappers.
$isHttpUrl = is_string($product_url) && preg_match('#^https?://#i', $product_url) === 1;

if (!empty($product_name) && $isHttpUrl) {
    if ($type == 1 && !empty($proxy)) {
        // Fetch the product page through the proxy.
        $user_agent = "Mozilla/4.0";
        $stringHtmlContent = curl_string(BAIDU_URL, $user_agent, $proxy);
        if (empty($stringHtmlContent)) {
            if (file_exists($file_name)) {
                unlink($file_name);
            }
            exit('代理连接失败');
        }
    } else {
        // Fetch the product page directly, without a proxy.
        $stringHtmlContent = file_get_contents(BAIDU_URL);
    }

    // Step 1: take the first link inside the download box of the product page.
    $str_url2 = '';
    if (is_string($stringHtmlContent)
        && preg_match("/<div class=\"dlText\">(.*?)<span class=\"dlSpywareFree\">/is", $stringHtmlContent, $arrayHtmlContent)
        && preg_match_all("/href=\"([^\"]+)/i", $arrayHtmlContent[1], $match)
    ) {
        $str_url2 = $match[1][0];
    }

    // Step 2: fetch the intermediate download page.
    // NOTE(review): this request never goes through the proxy, even when
    // $type == 1 - confirm whether that is intended.
    $stringHtmlContent2 = false;
    if (preg_match('#^https?://#i', $str_url2) === 1) {
        $stringHtmlContent2 = file_get_contents($str_url2);
    }

    // Step 3: read the first src:'...' value from the inline script of that page.
    if (is_string($stringHtmlContent2)
        && preg_match("/<div class=\"watchListB\">(.*?)<div id=\"contentBody\">/is", $stringHtmlContent2, $arrayHtmlContent2)
        && preg_match("/<script type=\"text\/javascript\">(.*?)<\/script>/is", $arrayHtmlContent2[1], $exe)
        && preg_match_all("/src:'(.*?)'/i", $exe[1], $match2)
    ) {
        $exe_url = $match2[1];
        $urlexe = $exe_url[0];
    }
} else {
    echo '参数错误!';
}

// Store the result (possibly empty) in "<product_name>.txt" for the client.
wfile($file_name, $urlexe);
|
保存代码为cnet.php,上传文件到服务器,使用客户端访问该文件,将会得到最终下载链接.