<?
PHP
// Get all links from a URL
/**
 * array_walk() callback: removes markup from one array element and trims
 * the surrounding whitespace, writing the cleaned text back in place.
 *
 * @param string $item Element value; modified by reference.
 * @param mixed  $key  Element key handed over by array_walk(); not used.
 * @return void
 */
function remove_html_tags(&$item, $key)
{
    $withoutTags = strip_tags($item);
    $item = trim($withoutTags);
}
/**
 * Returns the part of a URL that follows the host (path plus query string).
 *
 * The scheme is optional and may be "http://" or "https://" in any letter
 * case, so "www.example.com/a?b=1" and "https://www.example.com/a?b=1"
 * both yield "/a?b=1".
 *
 * @param string $url URL to split.
 * @return string Everything after the host; '' when nothing follows the
 *                host; $url unchanged when it does not start with a host
 *                (for example '' or a root-relative "/path").
 */
function get_path($url)
{
    // "https?" matters: without the optional "s", "https:" is taken for the host.
    if (!preg_match('/^(https?:\/\/)?([^\/]+)/i', $url, $matches)) {
        // Nothing to strip; returning here also avoids reading an undefined $matches[0].
        return (string) $url;
    }

    // Cast because substr() yields false rather than '' on PHP < 8 when the
    // URL ends right after the host.
    return (string) substr($url, strlen($matches[0]));
}
/**
 * Extracts the host portion of a URL.
 *
 * The scheme is optional and may be "http://" or "https://" in any letter
 * case. Whatever sits between the scheme and the first "/" is returned
 * as-is, so a ":port" suffix, if present, is part of the result.
 *
 * @param string $url URL to inspect.
 * @return string The host, or '' when the URL does not start with one
 *                (for example '' or a root-relative "/path").
 */
function get_host($url)
{
    // "https?" matters: without the optional "s", "https:" is taken for the host.
    if (!preg_match('/^(https?:\/\/)?([^\/]+)/i', $url, $matches)) {
        // No match means $matches[2] does not exist; report "no host" explicitly.
        return '';
    }

    return $matches[2];
}
/**
 * Fetches a URL over plain HTTP on port 80 and returns the raw response.
 *
 * The result is everything read from the socket, i.e. the status line and
 * response headers followed by the body; nothing is stripped or decoded.
 *
 * NOTE(review): the request is sent as HTTP/1.1, so a server may answer with
 * chunked transfer encoding; chunk-size lines are not removed here.
 *
 * @param string $url URL to fetch; host and path are taken from get_host()
 *                    and get_path().
 * @return string Raw response text, or '' when the connection could not be
 *                opened (the socket error is echoed) or the request could
 *                not be written.
 */
function get_all_html($url)
{
    $htmlString = '';
    $host = get_host($url);
    $path = get_path($url);

    // A URL without a path must still produce a valid request line.
    if (!is_string($path) || $path === '') {
        $path = '/';
    }

    $fp = fsockopen($host, 80, $errno, $errstr, 30);
    if (!$fp) {
        echo "$errstr ($errno)<br />\n";
        return $htmlString;
    }

    $out = "GET $path HTTP/1.1\r\n";
    $out .= "Host: $host\r\n";
    $out .= "Connection: Close\r\n\r\n";

    if (fwrite($fp, $out) === false) {
        fclose($fp);
        return $htmlString;
    }

    // Stop on the first failed read: feof() alone never becomes true on a
    // timed-out or broken socket, which would loop forever.
    while (($line = fgets($fp)) !== false) {
        $htmlString .= $line;
    }
    fclose($fp);

    return $htmlString;
}
/**
 * Downloads a page and collects its anchors as an "href => link text" map.
 *
 * Capture group 1 of the pattern is the href value and group 2 is whatever
 * sits between the opening tag and "</a>"; the latter is cleaned with
 * remove_html_tags(). Because the hrefs become array keys, a repeated href
 * keeps only the text of its last occurrence.
 *
 * @param string $url Page to scan, fetched through get_all_html().
 * @return array Map of href => cleaned link text.
 */
function get_links($url)
{
    // Opening tag up to and including ">" (group 1: href value) ...
    $openingTag = "/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>";
    // ... followed by the link content (group 2) and the closing tag.
    $contentAndClose = "([^<]+|.*?)?<\/a>/i";
    $pattern = trim($openingTag . $contentAndClose);

    $page = get_all_html($url);
    preg_match_all($pattern, $page, $found, PREG_PATTERN_ORDER);

    $hrefs = $found[1];
    $texts = $found[2];
    array_walk($texts, 'remove_html_tags');

    return array_combine($hrefs, $texts);
}
/**
 * Keyword filter used as an array_filter() callback: keeps only the links
 * that should be downloaded, i.e. those whose text contains the keyword.
 *
 * @param string $var     Text to test (the link title).
 * @param string $keyword Substring that must be present. Defaults to
 *                        '中日研修', the only links this script wants.
 * @return bool True when $var contains $keyword, false otherwise.
 */
function filter_string($var, $keyword = '中日研修')
{
    // An empty keyword filters nothing out; checking it here also avoids the
    // "Empty needle" warning strpos() raises on PHP < 8.
    if ($keyword === '') {
        return true;
    }

    return strpos($var, $keyword) !== false;
}
/**
 * Saves one page to the database: inserts a row into save_info(title, content)
 * in the "test" database on the local MySQL server.
 *
 * The values are sent through a prepared statement, so quotes or other SQL
 * metacharacters in the scraped text can neither break nor alter the query.
 * As before, a failed connection terminates the script with the error text.
 *
 * @param string $title Value for the title column.
 * @param string $html  Value for the content column.
 * @return bool True when the row was inserted, false when selecting the
 *              database, preparing or executing the statement failed.
 */
function save_html($title, $html)
{
    // Report failures through return values on every PHP version
    // (PHP 8.1+ would otherwise throw mysqli_sql_exception by default).
    mysqli_report(MYSQLI_REPORT_OFF);

    $link = mysqli_connect('localhost', 'root', '', '', 3306);
    if (!$link) {
        die(mysqli_connect_error());
    }

    $result = false;
    if (mysqli_select_db($link, 'test')) {
        $stmt = mysqli_prepare($link, 'insert into save_info(title, content) values (?, ?)');
        if ($stmt) {
            mysqli_stmt_bind_param($stmt, 'ss', $title, $html);
            $result = mysqli_stmt_execute($stmt);
            mysqli_stmt_close($stmt);
        }
    }

    // One connection is opened per call, so release it before returning.
    mysqli_close($link);

    return $result;
}
// Listing page whose links are harvested.
$url = "http://www.zggjlww.cn/share/?mods=news&action=list&id=15";
$arr = get_links($url);
if (!is_array($arr)) {
    // get_links() could not build its href => title map; nothing to process.
    $arr = array();
}
// Keep only the links whose title passes filter_string().
$newMatches = array_filter($arr, "filter_string");

// For every remaining link: turn the extracted href into a full URL by
// prefixing the host of the listing page (this assumes the hrefs are
// root-relative paths), fetch that page and store it under the link title.
$host = get_host($url);
$total = count($newMatches);
$count = 1;
// foreach replaces the former each()/list() loop; each() was deprecated in
// PHP 7.2 and removed in PHP 8.0.
foreach ($newMatches as $key => $val) {
    $key = "http://" . $host . $key;
    // The fetched pages are too large to store whole, so only a slice of the
    // entity-encoded response is kept. The cast covers PHP < 8, where substr()
    // returns false for responses shorter than the offset.
    $html = (string) substr(htmlentities(get_all_html($key)), 2000, 10000);
    $saved = save_html($val, $html);
    $rest = $total - $count;
    $count++;
    // Report the real outcome instead of always claiming success.
    $status = $saved ? '保存成功' : '保存失败';
    echo "$val = > $key <br />$status... 还有 $rest 张页面要处理...<p></p>";
}
?>