1. Downloading websites: wget
[root@stone ~]# wget www.baidu.com
--2013-05-20 10:21:08-- http://www.baidu.com/
Resolving www.baidu.com... 115.239.210.26, 115.239.210.27
Connecting to www.baidu.com|115.239.210.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10480 (10K) [text/html]
Saving to: `index.html'
100%[===================================================================================================>] 10,480 --.-K/s in 0.05s
2013-05-20 10:21:08 (199 KB/s) - `index.html' saved [10480/10480]
[root@stone ~]# wget -O www.baidu.com www.baidu.com
--2013-05-20 10:25:28-- http://www.baidu.com/
Resolving www.baidu.com... 115.239.210.26, 115.239.210.27
Connecting to www.baidu.com|115.239.210.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10460 (10K) [text/html]
Saving to: `www.baidu.com'
100%[===================================================================================================>] 10,460 --.-K/s in 0.05s
2013-05-20 10:25:28 (191 KB/s) - `www.baidu.com' saved [10460/10460]
#-O sets the output filename
#-t sets the number of retries
#-o writes a download log to the given file
#--limit-rate caps the download speed
#-Q (or --quota) sets a maximum download quota
#-c URL resumes a partially completed download
#--mirror downloads all pages of a website (mirrors it)
#-r downloads recursively
#-l depth sets the recursion depth; used together with -r
#--user and --password supply the username and password
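Several of these options are commonly combined in one command. A sketch (the URL here is a placeholder, not from an actual run):
[root@stone ~]# wget -t 3 -o wget.log --limit-rate 20k -c http://example.com/big-file.iso
#retry up to 3 times, log to wget.log, cap the speed at 20 KB/s, and resume if partially downloaded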
2. Downloading a web page as formatted plain text: lynx
[root@stone ~]# lynx -dump www.baidu.com > index.html
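Since -dump writes plain text (the rendered page followed by a numbered list of its links), a .txt name may be clearer. A sketch:
[root@stone ~]# lynx -dump www.baidu.com > index.txt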
3. curl basics
Downloading
[root@stone ~]# curl -C - -O http://mirrors.163.com/centos/RPM-GPG-KEY-CentOS-6
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 1706 100 1706 0 0 285 0 0:00:05 0:00:05 --:--:-- 373
[root@stone ~]# ll RPM-GPG-KEY-CentOS-6
-rw-r--r-- 1 root root 1706 May 20 11:09 RPM-GPG-KEY-CentOS-6
#-C - resumes the transfer from where it left off (curl works out the offset)
#-C offset resumes the transfer from the given byte offset
#-O saves the download under the filename taken from the last part of the URL
#-o filename saves the download to the specified file
#--silent (-s) downloads silently, without progress information
#--limit-rate caps the download speed
#--max-filesize sets the maximum file size that will be downloaded
#-u username:password supplies the username and password
#-u username supplies only the username; the password is read from a prompt
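These options can also be combined. A sketch (the URL and credentials are placeholders):
[root@stone ~]# curl -s --limit-rate 20k --max-filesize 10000000 -u user:pass -O http://example.com/file.tar.gz
#quiet download capped at 20 KB/s, aborted if the file exceeds ~10 MB, with basic authentication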
Sending HTTP requests
[root@stone ~]# curl -I www.baidu.com
HTTP/1.1 200 OK
Date: Mon, 20 May 2013 03:18:12 GMT
Server: BWS/1.0
Content-Length: 10460
Content-Type: text/html;charset=utf-8
Cache-Control: private
Set-Cookie: BDSVRTM=1; path=/
Set-Cookie: H_PS_PSSID=1420_2447_1944_1788_2249; path=/; domain=.baidu.com
Set-Cookie: BAIDUID=78C44F4DC793B800B02746B241A9C08C:FG=1; expires=Mon, 20-May-43 03:18:12 GMT; path=/; domain=.baidu.com
Expires: Mon, 20 May 2013 03:18:12 GMT
P3P: CP=" OTI DSP COR IVA OUR IND COM "
Connection: Keep-Alive
#-I fetches only the HTTP response headers (a HEAD request)
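A related trick (not from the original transcript): the -w option can print just the numeric status code, which is handy in scripts such as the broken-link checker below.
[root@stone ~]# curl -s -o /dev/null -w "%{http_code}\n" www.baidu.com
#prints only the status code, e.g. 200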
[root@stone ~]# curl http://book.sarathlakshman.com/lsc/mlogs/submit.php -d "host=test-host&user=slynux"
<html>
You have entered :
<p>HOST : test-host</p>
<p>USER : slynux</p>
</html>
#-d (--data) sends a POST request; the site's HTML response is printed
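When form values contain spaces or other special characters they must be URL-encoded; curl can do this per field with --data-urlencode. A sketch against the same endpoint:
[root@stone ~]# curl http://book.sarathlakshman.com/lsc/mlogs/submit.php --data-urlencode "host=test host" --data-urlencode "user=slynux"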
4. Building an image grabber and downloader
[root@stone bin]# curl -s www.e-acic.com | egrep -o "<img src=[^>]*>"
<img src="/images/ac_06.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/images/ac_14.jpg" />
<img src="/uploads/2013/01/311556055003.jpg" />
<img src="/uploads/2013/01/301405454704.png" />
<img src="/uploads/2013/01/301500445838.jpg" />
<img src="/images/wenzi1.png" />
<img src="/uploads/2013/01/301458396818.jpg" />
<img src="/uploads/2013/01/280942174858.png" />
<img src="/uploads/2013/05/171009004173.jpg" />
<img src="/uploads/2013/05/171009045981.png" />
<img src="images/index_28.png" />
<img src="images/index_30.png" />
<img src="images/index_33.png" />
<img src="images/index_34.png" />
[root@stone bin]# curl -s www.e-acic.com | egrep -o "<img src=[^>]*>" | sed 's/<img src=\"\([^"]*\).*/\1/g'
/images/ac_06.jpg
/images/ac_14.jpg
/images/ac_14.jpg
/images/ac_14.jpg
/images/ac_14.jpg
/images/ac_14.jpg
/images/ac_14.jpg
/images/ac_14.jpg
/images/ac_14.jpg
/uploads/2013/01/311556055003.jpg
/uploads/2013/01/301405454704.png
/uploads/2013/01/301500445838.jpg
/images/wenzi1.png
/uploads/2013/01/301458396818.jpg
/uploads/2013/01/280942174858.png
/uploads/2013/05/171009004173.jpg
/uploads/2013/05/171009045981.png
images/index_28.png
images/index_30.png
images/index_33.png
images/index_34.png
[root@stone ~]# cat bin/img_download.sh
#!/bin/bash
if [ $# -ne 3 ]; then
    echo "Usage: $0 URL -d directory"
    exit 1
fi

# Walk the arguments, picking up the URL and the -d directory value.
while [ $# -gt 0 ]
do
    case $1 in
        -d) shift; directory=$1; shift ;;
         *) url=${url:-$1}; shift ;;
    esac
done

mkdir -p "$directory"
# Extract the scheme and host, e.g. http://www.e-acic.com
baseurl=$(echo "$url" | egrep -o "https?://[a-zA-Z0-9.-]+")

# Collect the img src values into a temporary list file.
curl -s "$url" | egrep -o "<img src=[^>]*>" | sed 's/<img src=\"\([^"]*\).*/\1/g' > /tmp/$$.list

# Prefix site-absolute paths (leading /) with the base URL. Double quotes are
# required here; with single quotes $baseurl would not be expanded. Relative
# paths without a leading / (e.g. images/index_28.png) are left untouched.
sed -i "s|^/|$baseurl/|" /tmp/$$.list

cd "$directory"
while read filename
do
    curl -C - -O "$filename"
done < /tmp/$$.list
rm -f /tmp/$$.list
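A usage sketch against the site from the transcript above (note that the URL must include the http:// scheme, otherwise the baseurl extraction yields an empty string):
[root@stone ~]# bin/img_download.sh http://www.e-acic.com -d /tmp/images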
5. Finding broken links on a website
[root@stone ~]# cat bin/find_broken.sh
#!/bin/bash
if [ $# -ne 1 ];
then
    echo -e "Usage: $0 URL\n"
    exit 1
fi

echo Broken links:
mkdir -p /tmp/broken.lynx
cd /tmp/broken.lynx
# lynx -traversal crawls the site and writes the links it could not
# follow to reject.dat in the current directory.
lynx -traversal $1 > /dev/null

count=0
sort -u reject.dat > links.txt
while read link;
do
    # A link counts as good only if a HEAD request returns an OK status.
    output=$(curl -I $link -s | grep "HTTP/.*OK")
    if [[ -z $output ]];
    then
        echo $link
        let count++
    fi
done < links.txt
[ $count -eq 0 ] && echo No broken links found.
# The run can stall at lynx -traversal $1 > /dev/null
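lynx -traversal can stall waiting for interactive input, for example on cookie prompts. A possible workaround (a sketch, untested here) is to auto-accept cookies and bound the crawl with coreutils timeout:
timeout 300 lynx -accept_all_cookies -traversal $1 > /dev/null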
6. Tracking changes to a website
[root@stone ~]# cat bin/change.sh
#!/bin/bash
if [ $# -ne 1 ];
then
    echo -e "Usage: $0 URL\n"
    exit 1
fi

first_time=0
# No archived copy yet means this is the first run.
if [ ! -e "last.html" ];
then
    first_time=1
fi

curl --silent $1 -o recent.html
if [ $first_time -ne 1 ];
then
    changes=$(diff -u last.html recent.html)
    if [ -n "$changes" ];
    then
        echo -e "Changes:\n"
        echo "$changes"
    else
        echo -e "\nWebsite has no changes"
    fi
else
    echo "[First run] Archiving..."
fi
cp recent.html last.html
# Checks for different websites must be run from different directories, since last.html and recent.html are kept in the current directory.
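Since the archived copies live in the current directory, periodic monitoring fits naturally into cron, run from a per-site directory. A sketch (the schedule and paths are hypothetical):
0 * * * * cd /var/monitor/baidu && /root/bin/change.sh http://www.baidu.com >> changes.log 2>&1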