import xlwt
import requests
from bs4 import BeautifulSoup

'''
Scraping these pages returns 403 right away, meaning we have no access permission.
'''

# Entry page
start_url = 'https://www.dianping.com/search/category/344/10'
def get_content(url, headers=None):
    response = requests.get(url, headers=headers)  # issue a single HTTP request
    html = response.content
    return html
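
# In this script: get_content(start_url) fetches the listing pages without headers, while
# get_content(shop_url, headers) passes the browser-like headers defined in __main__ for the detail pages.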
'''
Collect the URLs of all administrative districts.
'''
def region_url(html):
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    # The region nav holds one <a> per district (the anchor text is the district name, e.g. 芙蓉区);
    # collect their hrefs with a list comprehension.
    base_url = 'https://www.dianping.com'
    region_url_list = [base_url + i['href'] for i in soup.find('div', id="region-nav").find_all('a')]
    return region_url_list
# Get the detail-page URL of every shop on a listing page.
# find: returns the first match (a single element, or None if nothing matches); find_all: returns every match (a list, or [] if nothing matches).
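# A minimal illustration of the difference, with made-up markup (not taken from Dianping):
#   BeautifulSoup('<div class="tit"><a href="/shop/1">x</a></div>', 'lxml').find('a')['href']  # -> '/shop/1'
#   BeautifulSoup('<div></div>', 'lxml').find('a')      # -> None
#   BeautifulSoup('<div></div>', 'lxml').find_all('a')  # -> []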
def get_shop_url(html):
    base_url = 'https://www.dianping.com'
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    shop_url_list = [base_url + i.find('a')['href'] for i in soup.find_all('div', class_='tit')]
    return shop_url_list
# Parse a shop's detail page and extract the fields we need (shop name, price, ratings, ...).
def get_detail(html):
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    # Shop name, e.g. 1911牛肉烤串
    title = soup.find('div', class_='breadcrumb').find('span').text
    # Average price per person, e.g. 人均:-
    price = soup.find('span', id="avgPriceTitle").text
    # List of rating spans
    evaluation = soup.find('span', id="comment_score").find_all('span', class_="item")
    # Number of reviews, e.g. 3条评论
    comments = soup.find('span', id="reviewCount").text
    # Address, e.g. 麓松路南丰港安置小区12栋
    address = soup.find('span', class_="item", itemprop="street-address").text.strip()
    # print(u'店名' + title)
    # for ev in evaluation:
    #     print(ev.text)
    # print(u'价格' + price)
    # print(u'评论数量' + comments)
    # print(u'地址' + address)
    return (title, evaluation[0].text, evaluation[1].text, evaluation[2].text, price, comments, address)
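
# The tuple above is written to the spreadsheet in the same column order as headData below:
# (商户名字, 口味评分, 环境评分, 服务评分, 人均价格, 评论数量, 地址)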
# This block runs only when the file is executed directly as a script; it is not executed when the file is imported into another script.
if __name__ == '__main__':
    items = []
    headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
'Cookie':'_hc.v=dd67ff67-20d0-6e83-7f61-ce93e4d46539.1503387665; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorganic; _lxsdk_cuid=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; _lxsdk=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; __utma=205923334.211352043.1503391484.1503391484.1503391484.1; __utmz=205923334.1503391484.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); looyu_id=29bc50ef1530ab64cbaa69b29cad64f39a_51868%3A1; s_ViewType=10; JSESSIONID=A49EED22A236962EA3506BA888799402; aburl=1; cy=344; cye=changsha; PHOENIX_ID=0a010918-15e0a223263-d4c1a92; __mta=146625163.1503391361571.1503401588676.1503408592089.10; _lxsdk_s=15e0a219034-38-9d5-acb%7C%7C37'
    }
    html = get_content(start_url)
    region_url_list = region_url(html)
    # Walk every shop in every administrative district.
    for url in region_url_list:  # iterate over the districts
        # Crude error handling: skip the whole district on any error.
        try:
            for n in range(1, 51):  # iterate over all 50 listing pages
                html = get_content(url + 'p' + str(n))
                # Detail-page URLs of every shop on this listing page
                shop_url_list = get_shop_url(html)
                for shop_url in shop_url_list:
                    # print(shop_url)
                    # Fetch the detail page and extract its data.
                    detail_html = get_content(shop_url, headers)
                    '''
                    403 Forbidden (no access permission):
                    (1) returned right away: send browser-like request headers;
                    (2) returned only after scraping for a while: rotate proxy IPs.
                    Relevant headers: Referer (anti-hotlinking), Host (the domain), Cookie.
                    '''
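                    # A possible sketch for case (2), kept commented out; the proxy address is a placeholder, not part of this project:
                    # proxies = {'http': 'http://10.0.0.1:8888', 'https': 'http://10.0.0.1:8888'}
                    # detail_html = requests.get(shop_url, headers=headers, proxies=proxies).content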
                    items.append(get_detail(detail_html))
        except Exception:
            continue
    new_table = r'F:\reptile_Python\daZhongDianPin_spiders\dzdp.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('test1')
    headData = ['商户名字', '口味评分', '环境评分', '服务评分', '人均价格', '评论数量', '地址']
    for colnum in range(0, 7):
        ws.write(0, colnum, headData[colnum], xlwt.easyxf('font:bold on'))
    index = 1
    lens = len(items)
    for j in range(0, lens):
        for i in range(0, 7):
            ws.write(index, i, items[j][i])
        index = index + 1
    wb.save(new_table)
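
# When the run finishes, dzdp.xls contains one bold header row plus one row per scraped shop,
# with the seven columns listed in headData.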