兴趣是最好的导师。
看中一个免费的美漫网站,但是被墙起来了,除了校门不能看
打算把上面的图片爬下来。
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import socket
baseurl = "http://www.readcomics.tv/loki-agent-of-asgard/chapter-1/full"
#伪装浏览器,以免被封
def user_agent(url):
req_header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
req_timeout = 20
try:
req = urllib2.Request(url,None,req_header)
page = urllib2.urlopen(req,None,req_timeout)
html = page
except urllib2.URLError as e:
print e.message
except socket.timeout as e:
user_agent(url)
return html
def page_loop(pageid):
url = baseurl+'?p=%s'%pageid
print url
page = user_agent(url)
soup = BeautifulSoup(page)
total_img = 0
img = soup.find_all(['img'])
for myimg in img:
link = myimg.get('src')
total_img += 1
print link
# content2 = urllib2.urlopen(link).read()
content2 = user_agent(link).read()
#这句代码直接从OSC上面弄下来的
#D:\myimg是保存路径,你可以自己改成自己的,但是路径必须要自己创建好
with open(u'ImageDownload'+'/'+link[link.rfind('/'):],'wb') as code:
code.write(content2)
print total_img
return total_img
page_start = 0
page_stop = 4
total = 0
for i in range(page_start,page_stop):
total+=page_loop(i)
print total
#total就是统计下总共保存到本地的图片数量
不是很完善,问题如下:
1.没有生成文件夹
2.保存的文件名称不合规范
reference:
http://blog.csdn.net/cruise_h/article/details/25737737
https://www.lylinux.org/python%E7%88%AC%E8%99%AB%E6%8A%93%E5%8F%96%E7%BD%91%E7%AB%99%E5%9B%BE%E7%89%87%E5%B9%B6%E4%BF%9D%E5%AD%98.html