Python crawler: scrape every merchant in all administrative districts on Dianping (大众点评) and save the data to Excel

import xlwt
'''
Crawling the page can immediately return 403, which means access is forbidden
'''
import requests
from bs4 import BeautifulSoup

# Entry page
start_url = 'https://www.dianping.com/search/category/344/10'

def get_content(url, headers=None):
    response = requests.get(url, headers=headers)  # issue one HTTP request
    html = response.content
    return html
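
get_content returns the raw response body without looking at the status code, so a 403 error page would be handed to the parsers like a normal page. Below is a minimal sketch, not part of the original script, of how the status could be checked and retried before parsing; the function name, retry count and sleep interval are assumptions made up for illustration.

import time
import requests

def get_content_checked(url, headers=None, retries=3):
    # Hypothetical variant of get_content: surface 403s instead of parsing the error page
    for attempt in range(retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 403:
            time.sleep(2)  # Forbidden: wait briefly and try again (a proxy or a fresh cookie helps more)
            continue
        response.raise_for_status()  # raise on other HTTP errors (404, 500, ...)
        return response.content
    raise RuntimeError('still 403 after %d attempts: %s' % (retries, url))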

'''
    Get the URL of every administrative district
'''
def region_url(html):
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    # Sample link text inside the "region-nav" div: 芙蓉区
    # List comprehension over all district links
    base_url = 'https://www.dianping.com'
    region_url_list = [base_url + i['href'] for i in soup.find('div', id="region-nav").find_all('a')]
    return region_url_list

# Get the detail-page URL of every merchant
# find: returns the first match (None when nothing matches); find_all: returns every match as a list ([] when nothing matches) (see the short example below)
def get_shop_url(html):
    base_url = 'https://www.dianping.com'
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    shop_url_list = [base_url + i.find('a')['href'] for i in soup.find_all('div', class_='tit')]
    return shop_url_list

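As noted in the comment above get_shop_url, find returns only the first matching element (or None), while find_all returns a list with every match (or an empty list). A tiny self-contained illustration with a made-up HTML snippet:

from bs4 import BeautifulSoup

snippet = '<div class="tit"><a href="/shop/1">A</a></div><div class="tit"><a href="/shop/2">B</a></div>'
demo = BeautifulSoup(snippet, 'lxml')
print(demo.find('div', class_='tit').find('a')['href'])  # '/shop/1': only the first match
print(len(demo.find_all('div', class_='tit')))           # 2: every match, returned as a list
print(demo.find('div', class_='missing'))                # None: no element matches
print(demo.find_all('div', class_='missing'))            # []: empty list, safe to iterate
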
# Parse a detail page and extract the merchant info (name, price, scores, ...)
def get_detail(html):
    soup = BeautifulSoup(html, 'lxml')  # lxml parser
    # Shop name from the breadcrumb span, e.g. "1911牛肉烤串"
    title = soup.find('div', class_='breadcrumb').find('span').text
    # Average price per person, e.g. "人均:-"
    price = soup.find('span', id="avgPriceTitle").text
    # Scores, e.g. "口味:7.6 环境:7.4 服务:7.5"
    evaluation = soup.find('span', id="comment_score").find_all('span', class_="item")  # list of score spans
    # Number of reviews, e.g. "3条评论"
    comments = soup.find('span', id="reviewCount").text
    # Street address, e.g. "麓松路南丰港安置小区12栋"
    address = soup.find('span', class_="item", itemprop="street-address").text.strip()
    # print u'店名' + title
    # for ev in evaluation:
    #     print ev.text
    # print u'价格' + price
    # print u'评论数量' + comments
    # print u'地址' + address
    return (title, evaluation[0].text, evaluation[1].text, evaluation[2].text, price, comments, address)

# Runs only when the file is executed as a script, not when it is imported by another module.
if __name__ == '__main__':
    items = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Cookie': '_hc.v=dd67ff67-20d0-6e83-7f61-ce93e4d46539.1503387665; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorganic; _lxsdk_cuid=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; _lxsdk=15e08e4c108c8-01758fac19fbe5-3f63440c-100200-15e08e4c108c8; __utma=205923334.211352043.1503391484.1503391484.1503391484.1; __utmz=205923334.1503391484.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); looyu_id=29bc50ef1530ab64cbaa69b29cad64f39a_51868%3A1; s_ViewType=10; JSESSIONID=A49EED22A236962EA3506BA888799402; aburl=1; cy=344; cye=changsha; PHOENIX_ID=0a010918-15e0a223263-d4c1a92; __mta=146625163.1503391361571.1503401588676.1503408592089.10; _lxsdk_s=15e0a219034-38-9d5-acb%7C%7C37'
    }
    html = get_content(start_url)
    region_url_list = region_url(html)
    # Walk through every merchant of every administrative district
    for url in region_url_list:  # iterate over the districts
        # Crude error handling: on any exception, skip to the next district
        try:
            for n in range(1, 51):  # iterate over all 50 result pages
                html = get_content(url + 'p' + str(n))
                # Detail-page URLs of every merchant on this page
                shop_url_list = get_shop_url(html)
                for shop_url in shop_url_list:
                    # print shop_url
                    # Fetch the detail page and extract the data
                    detail_html = get_content(shop_url, headers)
                    '''
                    403 Forbidden (no access permission):
                    (1) appears immediately
                    (2) appears after crawling for a while: can be worked around with proxy IPs,
                        plus the Referer (anti-hotlinking), Host and Cookie headers
                        (see the sketch after the script)
                    '''
                    items.append(get_detail(detail_html))
        except:
            continue

    new_table = r'F:\reptile_Python\daZhongDianPin_spiders\dzdp.xls'
    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('test1')
    # Column headers: shop name, taste score, environment score, service score, average price, review count, address
    headData = ['商户名字', '口味评分', '环境评分', '服务评分', '人均价格', '评论数量', '地址']
    for colnum in range(0, 7):
        ws.write(0, colnum, headData[colnum], xlwt.easyxf('font: bold on'))
    index = 1
    lens = len(items)
    for j in range(0, lens):
        for i in range(0, 7):
            ws.write(index, i, items[j][i])
        index = index + 1
    wb.save(new_table)
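
The docstring in the crawl loop mentions that 403s showing up mid-crawl can be worked around with proxy IPs plus Referer, Host and Cookie headers. A minimal sketch of what that could look like with requests; the proxy address below is a placeholder, not a working proxy, and Dianping may still block the request.

import requests

proxies = {
    # hypothetical proxy pool entry, replace with a real proxy before use
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
extra_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
    'Referer': 'https://www.dianping.com/',  # Referer is checked for anti-hotlinking
    'Host': 'www.dianping.com',
}
response = requests.get('https://www.dianping.com/search/category/344/10',
                        headers=extra_headers, proxies=proxies, timeout=10)
print(response.status_code)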
