Python Crawler Tutorial: The Complete Python Crawler Collection

I. Crawler Basics

A web crawler (also called a web spider or web robot, and in the FOAF community more often a "web chaser") is a program or script that automatically fetches information from the World Wide Web according to a set of rules.

The examples below use urllib.request from Python 3.6.

1. Quickly fetching a web page

(1) GET requests


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei

import urllib.request

# keywd = "python"
keywd = "百度"

# URL-encode the keyword so non-ASCII characters are handled correctly
keywd = urllib.request.quote(keywd)

# The search URL prefix is blank in the source listing
url = "" + keywd

req = urllib.request.Request(url)
# urlopen loads the page into memory
data = urllib.request.urlopen(req).read()

fh = open("F:/python/data/douban/2.html", "wb")
fh.write(data)
fh.close()

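To see what the quote() call actually does (it is the same function whether reached as urllib.request.quote or urllib.parse.quote), here is a quick standalone check:

import urllib.parse

print(urllib.parse.quote("百度"))   # -> %E7%99%BE%E5%BA%A6 (UTF-8 percent-encoding)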

(2) POST requests


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei
# POST request: simulating a login form submission

import urllib.request
import urllib.parse

# The target form URL is blank in the source listing
url = ""

# Encode the form fields
mydata = urllib.parse.urlencode({"name": "[email protected]", "pass": "123ssd"}).encode("utf-8")

req = urllib.request.Request(url, mydata)
data = urllib.request.urlopen(req).read()

fh = open("F:/python/data/douban/2_1.html", "wb")
fh.write(data)
fh.close()


2. Simulating a browser

Use case: some sites add anti-crawler measures to stop their content from being harvested automatically, yet we still want to crawl them.

Solution: set suitable header fields (in particular User-Agent) so the request looks as if it comes from a normal browser.
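As a minimal sketch of the idea (the URL below is only a placeholder), a User-Agent can be attached directly to a single request instead of installing a global opener:

import urllib.request

# Placeholder URL; any page that inspects the User-Agent will do
url = "https://www.example.com/"
req = urllib.request.Request(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0"
})
data = urllib.request.urlopen(req).read()

The examples below use the other common pattern, build_opener() plus install_opener(), which applies the headers to every later urlopen call.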

Crawling high-resolution Taobao images


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei

import urllib.request
import re

keyname = "连衣裙"   # search keyword ("dress")

# URL-encode the keyword
key = urllib.request.quote(keyname)

# Pretend to be Firefox by sending its User-Agent string
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0")

# Create an opener object
opener = urllib.request.build_opener()
# Add the header
opener.addheaders = [headers]
# Install the opener globally
urllib.request.install_opener(opener)

for i in range(0, 1):
    # Build the page URL; the search URL prefix is blank in the source listing,
    # and the s parameter offsets the results by 60 items per page
    url = "" + key + "&cat=50344007&style=grid&seller_type=taobao&bcoffset=12&s=" + str(i * 60)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")

    # Regex that extracts the image URLs
    pat = 'pic_url":"//(.*?)"'
    image_list = re.compile(pat).findall(data)
    print(image_list)

    for j in range(0, len(image_list)):
        thisimg = image_list[j]
        thisimg_url = "http://" + thisimg
        file = "F:/python/data/douban/img/" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimg_url, filename=file)


Crawling CSDN data


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei

import urllib.request
import re

url = "http://blog.csdn.net/"

# Pretend to be a browser; User-Agent identifies the client
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
# Create an opener object
opener = urllib.request.build_opener()
# Add the header
opener.addheaders = [headers]
# Install the opener globally
urllib.request.install_opener(opener)

# Fetch the page
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")

# The article-link regex was stripped when the post was rendered;
# result collects the matched article URLs
pat = ''
result = re.compile(pat).findall(data)

for i in range(0, len(result)):
    file = "F:/python/data/douban/csdn/" + str(i) + ".html"
    urllib.request.urlretrieve(result[i], filename=file)
    print("Article " + str(i) + " downloaded successfully")


3. Exception handling

The most common errors a crawler runs into are URLError and HTTPError.

Adding exception handling to the script makes the crawler more robust.
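The usual pattern looks like this (a minimal sketch; HTTPError is a subclass of URLError, so testing for the code and reason attributes covers both):

import urllib.request
import urllib.error

try:
    data = urllib.request.urlopen("http://news.sina.com.cn/").read()
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)      # HTTP status code, e.g. 404
    if hasattr(e, "reason"):
        print(e.reason)    # textual reason, e.g. "Not Found"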

Crawling the Sina News homepage


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei

"""
Goal: download every news article linked from the Sina News homepage (http://news.sina.com.cn/) to local files.
Approach: fetch the homepage first, extract all news links with a regex, then fetch the articles one by one and save them.
"""

import urllib.request
import urllib.error
import re

# Fetch the homepage; urlopen loads the page into memory
data = urllib.request.urlopen("http://news.sina.com.cn/").read()
# Decode the downloaded bytes
data2 = data.decode("utf-8", "ignore")

# The link-extracting regex was stripped when the post was rendered
pat = ''
allurl = re.compile(pat).findall(data2)

# Download every article, guarding each request with exception handling
# (this loop follows the approach stated above; the save path is illustrative)
for i in range(0, len(allurl)):
    try:
        print("Crawling article " + str(i))
        file = "F:/python/data/douban/sinanews/" + str(i) + ".html"
        urllib.request.urlretrieve(allurl[i], filename=file)
        print("----- success -----")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

(1) Single-threaded crawler (crawling Qiushibaike)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei

import urllib.request
import urllib.error
import re

# Only the tail of this listing survives in the source; the page loop and the
# fetch below mirror the multi-threaded version that follows, and the page-URL
# prefix and the joke regex were stripped when the post was rendered.
for i in range(1, 36):
    try:
        url = "" + str(i)
        pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        pat = ''
        # re.S lets "." match newlines, since a joke can span several lines
        datalist = re.compile(pat, re.S).findall(pagedata)
        for j in range(0, len(datalist)):
            print("Page " + str(i) + ", joke " + str(j) + ":")
            print(datalist[j])
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print(e)
        print("Page " + str(i) + ", item " + str(j) + " failed")


(2) Multi-threaded crawler (crawling Qiushibaike)


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author: Du Fei

import urllib.request
import urllib.error
import re
import threading

headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

class One(threading.Thread):
    # Initialise the thread
    def __init__(self):
        threading.Thread.__init__(self)

    # Work done by this thread: the odd-numbered pages
    def run(self):
        for i in range(1, 36, 2):
            try:
                # Page URL; the site prefix is blank in the source listing
                url = "" + str(i)
                pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
                # The joke-extracting regex was stripped when the post was rendered
                pat = ''
                # re.S lets "." match newlines, since a joke can span several lines
                datalist = re.compile(pat, re.S).findall(pagedata)
                for j in range(0, len(datalist)):
                    print("Page " + str(i) + ", joke " + str(j) + ":")
                    print(datalist[j])
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)

class Two(threading.Thread):
    # Initialise the thread
    def __init__(self):
        threading.Thread.__init__(self)

    # Work done by this thread: the even-numbered pages
    def run(self):
        for i in range(0, 36, 2):
            try:
                url = "" + str(i)
                pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
                pat = ''
                datalist = re.compile(pat, re.S).findall(pagedata)
                for j in range(0, len(datalist)):
                    print("Page " + str(i) + ", joke " + str(j) + ":")
                    print(datalist[j])
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)

one = One()
one.start()
two = Two()
two.start()


II. The Scrapy Framework

Hands-on projects

1. Automating a Douban login
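The listings below assume the Scrapy project skeleton already exists; a typical setup (the project name douban here is just a placeholder, while the spider name db matches the code) would be:

scrapy startproject douban
cd douban
scrapy genspider db douban.com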

(1) douban.py


# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest
import urllib.request

class DbSpider(scrapy.Spider):
    name = "db"
    allowed_domains = ["douban.com"]
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"}
    '''
    start_urls = (
        'http://www.douban.com/',
    )
    '''

    def start_requests(self):
        return [Request("https://accounts.douban.com/login",
                        callback=self.parse,
                        meta={"cookiejar": 1})]

    def parse(self, response):
        captcha = response.xpath("//img[@id='captcha_image']/@src").extract()
        url = "https://accounts.douban.com/login"
        if len(captcha) > 0:
            print("A captcha is required")
            # Semi-automatic captcha handling: download the image locally
            localpath = "F:/python/data/db/captcha.png"
            urllib.request.urlretrieve(captcha[0], filename=localpath)
            print("Please open the local captcha image and type in the characters")
            captcha_value = input()
            data = {
                "form_email": "[email protected]",
                "form_password": "abded",
                "captcha-solution": captcha_value,
                "redir": "https://www.douban.com/people/233455/",
            }
        else:
            print("No captcha this time")
            data = {
                "form_email": "[email protected]",
                "form_password": "abded",
                "redir": "https://www.douban.com/people/233455/",
            }
        print("Logging in ...")
        return [FormRequest.from_response(response,
                                          meta={"cookiejar": response.meta["cookiejar"]},
                                          headers=self.header,
                                          formdata=data,
                                          callback=self.next,
                                          )]

    def next(self, response):
        print("Logged in; now crawling the personal-centre page")
        title = response.xpath("/html/head/title/text()").extract()
        # note = response.xpath("//div[@class='note']/text()").extract()
        print(title[0])
        # print(note[0])

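The spider is then run from the project root with:

scrapy crawl db

The meta={"cookiejar": 1} passed with the first request tells Scrapy's cookies middleware to keep this session's cookies together, so the cookies obtained at login are reused by the follow-up request to the personal-centre page.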

(2) settings.py


USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'


2. Crawling Dangdang and writing the data to MySQL on Linux

(1)items.py


import scrapy

class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    comment = scrapy.Field()


(2)dd.py


# -*- coding: utf-8 -*-
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request

class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    # The start URL is blank in the source listing
    start_urls = ['']

    def parse(self, response):
        item = DangdangItem()
        item["title"] = response.xpath("//a[@name='itemlist-picture']/@title").extract()
        item["link"] = response.xpath("//a[@name='itemlist-picture']/@href").extract()
        item["comment"] = response.xpath("//a[@class='search_comment_num']/text()").extract()
        yield item
        # Follow pages 2-4 of the same category
        for i in range(2, 5):
            url = "http://category.dangdang.com/pg" + str(i) + "-cp01.54.00.00.00.00.html"
            yield Request(url, callback=self.parse)


(3)pipelines.py


# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting

import pymysql

class DangdangPipeline(object):
    def process_item(self, item, spider):
        # Connect to the database
        conn = pymysql.connect(host='XXXX', port=3306, user='root', passwd='XXX', db='XX',
                               charset='utf8')
        print(conn)
        # Create a cursor for executing statements
        cursor = conn.cursor()
        # Set the connection character set
        cursor.execute('set names utf8')

        for i in range(0, len(item["title"])):
            title = item["title"][i]
            link = item["link"][i]
            comment = item["comment"][i]
            # print(title)
            # print(link)
            # print(comment)
            sql = "insert into boods(title,link,comment) values(%s,%s,%s)"
            cursor.execute(sql, (title, link, comment))

        # pymysql does not autocommit by default, so commit explicitly
        conn.commit()
        cursor.close()
        conn.close()
        return item


(4) Add to settings.py


ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}

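With ROBOTSTXT_OBEY disabled and the pipeline registered, the spider is run from the project root with:

scrapy crawl dd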

3. Crawling JD.com product information (automatic crawling)

Create a crawl spider that scrapes JD product information and writes it into the database.

(1) Create the Scrapy project

scrapy startproject jingdong

(2) Generate the automatic crawl spider file

scrapy genspider -t crawl jd jd.com
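The -t crawl flag uses the crawl template, so the generated spider is a CrawlSpider with a rules attribute rather than a plain Spider.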

(3)items.py


# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:

import scrapy

class JingdongItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Product id
    id = scrapy.Field()
    # Product name
    title = scrapy.Field()
    # Shop the product belongs to
    shop = scrapy.Field()
    # Link to that shop
    shoplink = scrapy.Field()
    # Product price
    price = scrapy.Field()
    # Number of favourable reviews
    comment = scrapy.Field()


(4)jd.py


# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jingdong.items import JingdongItem
import re
import urllib.request

# Automatic (rule-based) crawler
class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jd.com']
    # The start URL is blank in the source listing
    start_urls = ['']

    rules = (
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        try:
            # Instantiate the item container
            i = JingdongItem()
            # URL of the current page
            thisurl = response.url
            pat = "item.jd.com/(.*?).html"
            # Check whether the current URL is a product-detail page
            x = re.search(pat, thisurl)
            if (x):
                # Product id
                thisid = re.compile(pat).findall(thisurl)[0]
                # Title
                title = response.xpath("//div[@id='spec-n1']/img[@id='spec-img']/@alt").extract()
                # Shop name (the source listing is cut off at this point)
                shop = response.xpath("//div[@class='name']/a/text()").extract()
