使用 Python 抓取网页内容,并下载网页内容中的图片。
使用 urllib 库抓取网页内容,该库是 Python 自带的标准库,不需要额外安装。
由于我这边抓取的是国外的网站,所以用到了代理;同时设置了 User-Agent,防止网站过滤掉没有 User-Agent 的请求。
爬虫类代码Scrapy.py:
import os
import urllib.parse
import urllib.request
class Scrapy:
    """Small urllib-based helper that fetches pages and downloads images.

    Plain-http requests are routed through the proxy named by ``proxy_host``.
    Every request carries the browser-like ``user_agent`` string.

    The ``headers`` dict built in ``__init__`` is kept for callers that read
    it, but neither ``getHtml`` nor ``downImg`` sends it: both send only the
    ``User-Agent`` header.
    """

    # Proxy handler and opener are rebuilt by setProxy() on every request;
    # the empty string is the "not built yet" placeholder.
    proxy = ''
    opener = ''
    # host:port of the proxy used for the 'http' scheme.
    proxy_host = '192.168.88.17:8333'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36'
    # Form fields that getHtml() url-encodes and sends as the request body.
    values = {'test':1}
    headers = ''

    def __init__(self):
        """Populate the per-instance browser-like ``headers`` dict."""
        self.headers = {
            'User-Agent':self.user_agent,
            'Host':'mangakakalot.com',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cache-Control':'max-age=0',
            'Proxy-Connection':'keep-alive',
            'Upgrade-Insecure-Requests':'1'
        }

    # Configure the proxy.
    def setProxy(self):
        """Build the proxy handler and an opener that uses it.

        The result is stored on ``self.proxy`` / ``self.opener``. Only the
        'http' scheme is mapped to ``proxy_host``.
        """
        self.proxy = urllib.request.ProxyHandler({'http': self.proxy_host})
        self.opener = urllib.request.build_opener(self.proxy, urllib.request.HTTPHandler)

    # Fetch the page content.
    def getHtml(self, url):
        """Return the body of ``url`` decoded as UTF-8 text.

        ``values`` is url-encoded and sent as the request body, so for http
        URLs urllib issues a POST rather than a GET.

        Raises whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError``) and ``UnicodeDecodeError`` when the body
        is not valid UTF-8.
        """
        self.setProxy()
        # Preserved from the original behaviour: the opener is installed
        # process-wide, so later urllib.request.urlopen() calls use it too.
        urllib.request.install_opener(self.opener)
        payload = urllib.parse.urlencode(self.values).encode(encoding='UTF8')
        req = urllib.request.Request(url, payload, {'User-Agent':self.user_agent})
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf8')

    # Download an image.
    def downImg(self, url, folder, filename):
        """Download ``url`` and save it as ``filename`` inside ``folder``.

        ``folder`` is created when it does not exist yet, and it may be given
        with or without a trailing path separator.

        Raises whatever ``urllib.request.urlretrieve`` raises, and ``OSError``
        when the folder cannot be created or the file cannot be written.
        """
        self.setProxy()
        self.opener.addheaders = [('User-Agent', self.user_agent)]
        # urlretrieve() always goes through the globally installed opener.
        urllib.request.install_opener(self.opener)
        if folder:
            # An empty folder means "current directory"; makedirs('') would raise.
            os.makedirs(folder, exist_ok=True)
        urllib.request.urlretrieve(url, os.path.join(folder, filename))
调用例子:
# -*- coding: UTF-8 -*-
"""Usage example: fetch one page, then download one image."""
from Scrapy import Scrapy

# Placeholder addresses; replace them with the real page and image URLs.
PAGE_URL = 'http://xxxxxxxxxx'
IMAGE_URL = 'http://xxxxxxxxxx.jpg'

scrapy = Scrapy()

# Fetch the page content as text.
html = scrapy.getHtml(url=PAGE_URL)

# Download the image and store it as ./data/pic/xx.jpg.
scrapy.downImg(url=IMAGE_URL, folder='./data/pic/', filename='xx.jpg')