I. Getting Started with Web Crawlers
A web crawler (also called a web spider or web robot, and in the FOAF community more often a "web chaser") is a program or script that automatically fetches information from the World Wide Web according to certain rules.
The examples in this section use urllib.request from Python 3.6.
1. Quickly fetching a web page
(1) GET request
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei
import urllib.request

# keywd = "python"
keywd = "百度"
# URL-encode the keyword so Chinese characters are handled correctly
keywd = urllib.request.quote(keywd)
url = "" + keywd   # the search URL prefix is missing in the original post
# urlopen reads the page into memory
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
fh = open("F:/python/data/douban/2.html", "wb")
fh.write(data)
fh.close()
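As a side note on the quote() call above: it only percent-encodes the keyword so it can be embedded in a URL. A minimal check in the interpreter (the encoded form in the comments assumes quote's default UTF-8 encoding):

import urllib.request

kw = urllib.request.quote("百度")   # '%E7%99%BE%E5%BA%A6'
print(kw)
print(urllib.request.unquote(kw))   # back to '百度'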
(2) POST request
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei
# POST request: simulating a login
import urllib.request
import urllib.parse

url = ""   # the login URL is missing in the original post
# build and encode the form fields
mydata = urllib.parse.urlencode({
    "name": "[email protected]",
    "pass": "123ssd"
}).encode("utf-8")
req = urllib.request.Request(url, mydata)
data = urllib.request.urlopen(req).read()
fh = open("F:/python/data/douban/2_1.html", "wb")
fh.write(data)
fh.close()
2. Simulating browser access
Scenario: some sites set up anti-scraping measures to keep others from harvesting their content, yet we still want to crawl them.
Solution: set some header information (in particular User-Agent) so the requests look like they come from an ordinary browser, as sketched below.
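Besides the global-opener approach used in the examples that follow, a User-Agent can also be attached to a single request. A minimal sketch (the example.com URL is only a placeholder):

import urllib.request

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0 Safari/537.36"}
# pass the headers directly to the Request object instead of installing a global opener
req = urllib.request.Request("http://www.example.com/", headers=headers)
data = urllib.request.urlopen(req).read()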
Scraping high-resolution images from Taobao
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei
import urllib.request
import re

keyname = "连衣裙"
# URL-encode the keyword
key = urllib.request.quote(keyname)
# pretend to be a Firefox browser
# User-Agent: Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0")
# create an opener object and add the header
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# install the opener globally
urllib.request.install_opener(opener)
for i in range(0, 1):
    # build the page URL (the search URL prefix is missing in the original post)
    url = "" + key + "&cat=50344007&style=grid&seller_type=taobao&bcoffset=12&s=" + str(i * 60)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    # regex for the image URLs
    pat = 'pic_url":"//(.*?)"'
    image_list = re.compile(pat).findall(data)
    print(image_list)
    for j in range(0, len(image_list)):
        thisimg = image_list[j]
        thisimg_url = "http://" + thisimg
        file = "F:/python/data/douban/img/" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimg_url, filename=file)
Scraping CSDN data
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei
import urllib.request
import re

url = "http://blog.csdn.net/"
# pretend to be a browser; User-Agent is the user-agent string
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
# create an opener object and add the header
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# install the opener globally
urllib.request.install_opener(opener)
# fetch the page
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
# regex for the article links (the pattern itself was stripped from the original post)
pat = ''
result = re.compile(pat).findall(data)
for i in range(0, len(result)):
    file = "F:/python/data/douban/csdn/" + str(i) + ".html"
    urllib.request.urlretrieve(result[i], filename=file)
    print("第" + str(i) + "爬取成功")
3. Exception handling
The most common errors a crawler runs into while scraping a site are URLError and HTTPError.
Adding exception handling makes the crawler script more robust; the basic pattern is sketched below.
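A minimal sketch of that pattern (the URL is only a placeholder; HTTPError is a subclass of URLError, so it is caught first):

import urllib.request
import urllib.error

try:
    data = urllib.request.urlopen("http://www.example.com/").read()
except urllib.error.HTTPError as e:   # the server answered with an error status
    print(e.code, e.reason)
except urllib.error.URLError as e:    # network / DNS level failure
    print(e.reason)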
Scraping the Sina News homepage
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei
"""
Goal: save every news article linked from the Sina News homepage (http://news.sina.com.cn/) to local files.
Approach: fetch the homepage first, extract all news links with a regex, then fetch each article in turn and store it locally.
"""
import urllib.request
import urllib.error
import re

# fetch the homepage; urlopen reads the page into memory
data = urllib.request.urlopen("http://news.sina.com.cn/").read()
# decode the downloaded bytes
data2 = data.decode("utf-8", "ignore")
# (the link regex and the per-article download loop were stripped from the original post)

(1) Single-threaded crawler (scraping Qiushibaike)

# only the tail of this example survived in the original post; the page loop and the
# fetch below mirror the multi-threaded version shown in (2)
for i in range(1, 36):
    try:
        url = "" + str(i)   # the page URL is missing in the original post
        pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        pat = ''            # the joke-text regex was stripped from the original post
        # the content may span several lines, hence re.S
        datalist = re.compile(pat, re.S).findall(pagedata)
        for j in range(0, len(datalist)):
            print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
            print(datalist[j])
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print(e)
        print("第" + str(i) + "页第" + str(j) + "篇文章失败")
(2) Multi-threaded crawler (scraping Qiushibaike)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei
import urllib.request
import re
import urllib.error
import threading

# pretend to be a browser and install the opener globally
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

class One(threading.Thread):
    def __init__(self):
        # initialize the thread
        threading.Thread.__init__(self)
    # what this thread does: crawl the odd-numbered pages
    def run(self):
        for i in range(1, 36, 2):
            try:
                url = "" + str(i)   # the page URL is missing in the original post
                pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
                pat = ''            # the joke-text regex was stripped from the original post
                # the content may span several lines, hence re.S
                datalist = re.compile(pat, re.S).findall(pagedata)
                for j in range(0, len(datalist)):
                    print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
                    print(datalist[j])
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)

class Two(threading.Thread):
    def __init__(self):
        # initialize the thread
        threading.Thread.__init__(self)
    # what this thread does: crawl the even-numbered pages
    def run(self):
        for i in range(0, 36, 2):
            try:
                url = "" + str(i)   # the page URL is missing in the original post
                pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
                pat = ''            # the joke-text regex was stripped from the original post
                datalist = re.compile(pat, re.S).findall(pagedata)
                for j in range(0, len(datalist)):
                    print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
                    print(datalist[j])
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)

one = One()
one.start()
two = Two()
two.start()
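The example above simply starts both threads and lets the main program return on its own. If the main program should not continue until both page ranges are finished (not part of the original code), the usual threading idiom is to join the threads after starting them:

# append after one.start() and two.start():
one.join()   # block until the odd-page thread finishes
two.join()   # block until the even-page thread finishes
print("both threads finished")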
II. The Scrapy Framework
Hands-on examples
1. Automating the Douban login
(1) douban.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest
import urllib.request

class DbSpider(scrapy.Spider):
    name = "db"
    allowed_domains = ["douban.com"]
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"}
    '''
    start_urls = (
        'http://www.douban.com/',
    )
    '''

    def start_requests(self):
        return [Request("https://accounts.douban.com/login", callback=self.parse, meta={"cookiejar": 1})]

    def parse(self, response):
        captcha = response.xpath("//img[@id='captcha_image']/@src").extract()
        url = "https://accounts.douban.com/login"
        if len(captcha) > 0:
            print("此时有验证码")
            # semi-automatic captcha handling: download the captcha image to a local path
            localpath = "F:/python/data/db/captcha.png"
            urllib.request.urlretrieve(captcha[0], filename=localpath)
            print("请查看本地验证码图片并输入验证码")
            captcha_value = input()
            data = {
                "form_email": "[email protected]",
                "form_password": "abded",
                "captcha-solution": captcha_value,
                "redir": "https://www.douban.com/people/233455/",
            }
        else:
            print("此时没有验证码")
            data = {
                "form_email": "[email protected]",
                "form_password": "abded",
                "redir": "https://www.douban.com/people/233455/",
            }
        print("登陆中……")
        return [FormRequest.from_response(response,
                                          meta={"cookiejar": response.meta["cookiejar"]},
                                          headers=self.header,
                                          formdata=data,
                                          callback=self.next,
                                          )]

    def next(self, response):
        print("此时已经登陆完成并爬取了个人中心的数据")
        title = response.xpath("/html/head/title/text()").extract()
        # note = response.xpath("//div[@class='note']/text()").extract()
        print(title[0])
        # print(note[0])
(2) settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
2. Scraping Dangdang data into MySQL on Linux
(1) items.py
import scrapy

class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    comment = scrapy.Field()
(2) dd.py
# -*- coding: utf-8 -*-
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request

class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    start_urls = ['']   # the start URL is missing in the original post

    def parse(self, response):
        item = DangdangItem()
        item["title"] = response.xpath("//a[@name='itemlist-picture']/@title").extract()
        item["link"] = response.xpath("//a[@name='itemlist-picture']/@href").extract()
        item["comment"] = response.xpath("//a[@class='search_comment_num']/text()").extract()
        yield item
        for i in range(2, 5):
            url = "http://category.dangdang.com/pg" + str(i) + "-cp01.54.00.00.00.00.html"
            yield Request(url, callback=self.parse)
(3) pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
import pymysql

class DangdangPipeline(object):
    def process_item(self, item, spider):
        # connect to the database
        conn = pymysql.connect(host='XXXX', port=3306, user='root', passwd='XXX', db='XX',
                               charset='utf8')
        print(conn)
        # create a cursor for executing statements
        cursor = conn.cursor()
        # set the character encoding for this session
        cursor.execute('set names utf8')
        for i in range(0, len(item["title"])):
            title = item["title"][i]
            link = item["link"][i]
            comment = item["comment"][i]
            # print(title)
            # print(link)
            # print(comment)
            sql = "insert into boods(title,link,comment) values(%s,%s,%s)"
            cursor.execute(sql, (title, link, comment))
        conn.commit()   # commit the inserts; without this pymysql discards them on close
        cursor.close()
        conn.close()
        return item
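The pipeline assumes a table with title, link and comment columns already exists in the target database. A minimal sketch for creating it with pymysql (the host, password and database name follow the placeholders above, the boods table name follows the insert statement, and the column sizes are assumptions):

import pymysql

conn = pymysql.connect(host='XXXX', port=3306, user='root', passwd='XXX', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS XX DEFAULT CHARACTER SET utf8")
cursor.execute("USE XX")
# column sizes are illustrative, not taken from the original post
cursor.execute("""
    CREATE TABLE IF NOT EXISTS boods (
        title   VARCHAR(200),
        link    VARCHAR(200),
        comment VARCHAR(100)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cursor.close()
conn.close()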
(4) Add the following to settings.py
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}
3. Scraping JD.com product information (automatic crawling)
Create a crawl spider that scrapes JD.com product information and writes it into a database.
(1) Create the Scrapy project
scrapy startproject jingdong
(2) Generate the crawl spider file
scrapy genspider -t crawl jd jd.com
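For reference, the file generated by this command looks roughly like the sketch below (details vary with the Scrapy version; allow=r'Items/' is just Scrapy's placeholder pattern). The rules tuple is what makes a CrawlSpider follow matching links automatically and hand each matching response to parse_item:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    rules = (
        # follow every link the extractor matches and pass the responses to parse_item
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item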
(3) items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
import scrapy

class JingdongItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # product id
    id = scrapy.Field()
    # product name
    title = scrapy.Field()
    # name of the shop selling the product
    shop = scrapy.Field()
    # link to that shop
    shoplink = scrapy.Field()
    # product price
    price = scrapy.Field()
    # number of favourable reviews
    comment = scrapy.Field()
(4) jd.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jingdong.items import JingdongItem
import re
import urllib.request

# automatic crawler
class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['']   # the start URL is missing in the original post

    rules = (
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        try:
            # instantiate the item container
            i = JingdongItem()
            # URL of the page currently being parsed
            thisurl = response.url
            pat = "item.jd.com/(.*?).html"
            # check whether the current URL matches the product-page pattern
            x = re.search(pat, thisurl)
            if (x):
                # product id
                thisid = re.compile(pat).findall(thisurl)[0]
                # title
                title = response.xpath("//div[@id='spec-n1']/img[@id='spec-img']/@alt").extract()
                # shop name (the original post is cut off at this point)
                shop = response.xpath("//div[@class='name']/a/text()"