This script crawls every hair-salon (美发) shop listed on Dianping for Shanghai, roughly 16,000+ shops in total. The code is straightforward; here it is.
# -*- coding: utf-8 -*-
import requests
import socket
import MySQLdb
import datetime
import time
from lxml import etree
import random
from UserAgent import user_agent_list


class DpShangHai:
    def __init__(self):
        self.Accept = '*/*'
        self.AcceptEncoding = 'gzip, deflate, sdch'
        self.AcceptLanguage = 'zh-CN,zh;q=0.8'
        self.CacheControl = 'max-age=0'
        self.Host = 'www.dpfile.com'
        self.pageIndex = None
        self.UserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        self.Connection = 'keep-alive'
        self.Accept_Language = 'zh-CN,zh;q=0.8'
        # initialize the request headers
        self.headers = ''
        self.proxy = ''
        # flag indicating whether the crawler should keep running
        self.enable = False
        # connect to MySQL (fill in your own host, user, password and database name)
        self.db = MySQLdb.connect("IP", "username", "password", "database")
        # SQL statement, built later
        self.sql = ''
        # get a cursor via cursor()
        self.cursor = self.db.cursor()
        self.of = open('proxy.txt', 'w')
        self.dates = str(datetime.date.today())
        self.LIST = [
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5', '浦东新区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r2', '徐汇区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r6', '黄浦区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r1', '卢湾区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r3', '静安区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r4', '长宁区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r12', '闵行区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r10', '杨浦区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r7', '普陀区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r9', '虹口区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r13', '宝山区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r8', '闸北区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5937', '松江区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5938', '嘉定区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5939', '青浦区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r8846', '奉贤区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r8847', '金山区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/c3580', '崇明县'),
        ]
    def getPage(self, url):
        self.Referer = url
        # pick a random User-Agent for every request
        self.UserAgent = random.choice(user_agent_list)
        self.headers = {'Accept': self.Accept,
                        'Accept-Encoding': self.AcceptEncoding,
                        'Accept-Language': self.AcceptLanguage,
                        'Cache-Control': self.CacheControl,
                        'Connection': self.Connection,
                        # 'Host': self.Host,
                        'Referer': self.Referer,
                        'User-Agent': self.UserAgent,
                        }
        try:
            timeout = 20
            socket.setdefaulttimeout(timeout)
            session = requests.session()
            request = session.get(url, timeout=60, headers=self.headers)
        except:
            print u"failed to open the page"
            return None
        else:
            return request.text
    def analysis_cbd(self, url, area):
        url_list = []
        area_list = []
        cbd_list = []
        htmlInfo = self.getPage(url)
        if htmlInfo:
            tree = etree.HTML(htmlInfo)
            href_list = tree.xpath('*//div[@class="nc-items nc-sub"]/a/@href')
            cbd_lists = tree.xpath('*//div[@class="nc-items nc-sub"]/a/span/text()')
            for href, cbd in zip(href_list, cbd_lists):
                if href == url.split('.com')[-1]:
                    pass
                else:
                    url_list.append('http://www.dianping.com' + href)
                    area_list.append(area)
                    cbd_list.append(cbd)
        return url_list, area_list, cbd_list
    def analysis_shop(self):
        for url in self.LIST:
            time.sleep(random.uniform(1, 1.5))
            url_list, area_list, cbd_list = self.analysis_cbd(url[0], url[1])
            for url, area, cbd in zip(url_list, area_list, cbd_list):
                time.sleep(random.uniform(1, 1.5))
                for page in xrange(1, 51, 1):
                    time.sleep(random.uniform(2, 3))
                    url_each = url + 'p' + str(page)
                    print url_each, area, cbd
                    try:
                        html = self.getPage(url_each)
                    except:
                        print u"connection timed out..."
                    else:
                        if html is not None:
                            tree = etree.HTML(html)
                            if len(tree.xpath('//div[@class="section Fix"]/div[2]/div[1]/div/h4/text()')) > 0:
                                print u"no matching merchants found on the Shanghai site"
                                break
                            else:
                                li_list = tree.xpath('//*[@class="shop-list J_shop-list shop-all-list"]/ul/li')
                                for li in li_list:
                                    try:
                                        if len(li.xpath('./div[@class="pic"]/a/@href')) > 0:
                                            shop_id = li.xpath('./div[@class="pic"]/a/@href')[0].replace('/shop/', '')
                                        else:
                                            shop_id = 'NULL'
                                        if len(li.xpath('./div[@class="txt"]/div[1]/a/@title')) > 0:
                                            shop_name = li.xpath('./div[@class="txt"]/div[1]/a/@title')[0]
                                        else:
                                            shop_name = 'NULL'
                                        if len(li.xpath('./div[@class="txt"]/div[2]/span/@title')) > 0:
                                            level = li.xpath('./div[@class="txt"]/div[2]/span/@title')[0]
                                        else:
                                            level = 'NULL'
                                        if len(li.xpath('./div[@class="txt"]/div[2]/a[1]/b/text()')) > 0:
                                            comment = li.xpath('./div[@class="txt"]/div[2]/a[1]/b/text()')[0]
                                        else:
                                            comment = '0'
                                        if len(li.xpath('./div[@class="txt"]/div[2]/a[2]/b/text()')) > 0:
                                            average = li.xpath('./div[@class="txt"]/div[2]/a[2]/b/text()')[0].replace(u"¥", '')
                                        else:
                                            average = '0'
                                        if len(li.xpath('./div[@class="txt"]/div[3]/span/text()')) > 0:
                                            adress = li.xpath('./div[@class="txt"]/div[3]/span/text()')[0]
                                        else:
                                            adress = 'NULL'
                                        if len(li.xpath('./div[@class="svr-info"]')) > 0:
                                            group_purchase = 'Y'
                                        else:
                                            group_purchase = 'N'
                                    except:
                                        print u"parse error..."
                                    else:
                                        '''
                                        insert the row into MySQL
                                        '''
                                        # print shop_id, shop_name, level, area, cbd, adress, comment, average, group_purchase
                                        try:
                                            self.db.set_character_set('utf8')
                                            self.sql = "INSERT INTO gz_bi_shanghai_shop(date,platfrom,shopid,shopname,level,area,cbd,adress,comments,average,group_purchase) VALUES ('" + self.dates + "',1,'" + shop_id + "','" + shop_name + "','" + level + "', '" + area + "', '" + cbd + "', '" + adress + "', '" + comment + "', '" + average + "', '" + group_purchase + "')"
                                            # print(self.sql)
                                            try:
                                                result = self.cursor.execute(self.sql)
                                                insert_id = self.db.insert_id()
                                                self.db.commit()
                                                # check whether the insert succeeded
                                                if result:
                                                    print "inserted successfully: %s" % insert_id
                                                else:
                                                    print "insert affected no rows"
                                            except MySQLdb.Error as e:
                                                print(e)
                                                # roll back on error
                                                self.db.rollback()
                                                # primary key already exists, so the row cannot be inserted
                                                if "key 'PRIMARY'" in e.args[1]:
                                                    print "row already exists, nothing inserted"
                                                else:
                                                    print "insert failed, reason %d: %s" % (e.args[0], e.args[1])
                                        except MySQLdb.Error as e:
                                            print "database error, reason %d: %s" % (e.args[0], e.args[1])
if __name__ == '__main__':
    print u"starting the crawler, please wait..."
    a = DpShangHai()
    a.analysis_shop()
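The `user_agent_list` imported at the top of the script lives in a separate file, UserAgent.py, which is nothing more than a list of User-Agent strings for `getPage()` to rotate through: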
import random

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
Run it and the rows are written straight into the MySQL database; after a few hours the data is all there.
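The post never shows the target table, so if you need to create it yourself, the sketch below is only a guess: the column names come from the INSERT statement in the script, while the column types and the primary key on shopid are assumptions. It also shows a parameterized version of the insert, which lets MySQLdb escape the values itself instead of concatenating them into the SQL string (shop names containing quotes would otherwise break the query).

# -*- coding: utf-8 -*-
# Sketch only: column names are taken from the crawler's INSERT statement;
# the types and the PRIMARY KEY on shopid are assumptions.
import MySQLdb

db = MySQLdb.connect("IP", "username", "password", "database", charset='utf8')
cursor = db.cursor()

cursor.execute("""
    CREATE TABLE IF NOT EXISTS gz_bi_shanghai_shop (
        date            DATE,
        platfrom        INT,
        shopid          VARCHAR(32),
        shopname        VARCHAR(128),
        level           VARCHAR(32),
        area            VARCHAR(32),
        cbd             VARCHAR(64),
        adress          VARCHAR(255),
        comments        VARCHAR(16),
        average         VARCHAR(16),
        group_purchase  CHAR(1),
        PRIMARY KEY (shopid)
    ) DEFAULT CHARSET=utf8
""")


def insert_shop(row):
    # row is the tuple built in the crawler loop:
    # (dates, shop_id, shop_name, level, area, cbd,
    #  adress, comment, average, group_purchase)
    sql = ("INSERT INTO gz_bi_shanghai_shop"
           "(date,platfrom,shopid,shopname,level,area,cbd,adress,"
           "comments,average,group_purchase) "
           "VALUES (%s, 1, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cursor.execute(sql, row)
    db.commit()

Inside the crawler this would replace the string-built self.sql with a single call, for example insert_shop((self.dates, shop_id, shop_name, level, area, cbd, adress, comment, average, group_purchase)).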