A Simple Crawler (Part 1)

This code scrapes all of the hair-salon (美发) shops in Shanghai from Dianping, roughly 16,000 shops in total. The code is simple, so here it is.

# -*- coding: utf-8 -*-

import requests
import socket
import MySQLdb
import datetime
import time
from lxml import etree
import random
from UserAgent import user_agent_list

class DpShangHai:
    def __init__(self):
        self.Accept = '*/*'
        self.AcceptEncoding = 'gzip, deflate, sdch'
        self.AcceptLanguage = 'zh-CN,zh;q=0.8'
        self.CacheControl = 'max-age=0'
        self.Host = 'www.dpfile.com'
        self.pageIndex = None
        self.UserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        self.Connection = 'keep-alive'
        self.Accept_Language = 'zh-CN,zh;q=0.8'
        # request headers, filled in per request in getPage()
        self.headers = ''
        self.proxy = ''
        # flag for whether the crawler should keep running
        self.enable = False
        # connect MySQLdb
        self.db = MySQLdb.connect("IP", "username", "password", "database")
        # holds the SQL statement to execute
        self.sql = ''
        # cursor() gets a cursor for executing SQL
        self.cursor = self.db.cursor()
        self.of = open('proxy.txt', 'w')
        self.dates = str(datetime.date.today())
        self.LIST  = [
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5','浦东新区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r2','徐汇区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r6','黄浦区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r1','卢湾区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r3','静安区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r4','长宁区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r12','闵行区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r10','杨浦区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r7','普陀区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r9','虹口区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r13','宝山区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r8','闸北区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5937','松江区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5938','嘉定区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5939','青浦区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r8846','奉贤区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r8847','金山区'),
            ('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/c3580','崇明县'),
        ]

    def getPage(self,url):
        self.Referer = url
        self.UserAgent = random.choice(user_agent_list)
        self.headers = {'Accept': self.Accept,
                        'Accept-Encoding': self.AcceptEncoding,
                        'Accept-Language': self.AcceptLanguage,
                        'Cache-Control': self.CacheControl,
                        'Connection': self.Connection,
                        # 'Host': self.Host,
                        'Referer': self.Referer,
                        'User-Agent': self.UserAgent,
                        }
        try:
            timeout = 20
            socket.setdefaulttimeout(timeout)
            session = requests.session()
            request = session.get(url, timeout=60, headers=self.headers)
        except Exception:
            print u"Failed to open page"
            return None
        else:
            return request.text

    def analysis_cbd(self,url,area):
        url_list = []
        area_list = []
        cbd_list = []
        htmlInfo = self.getPage(url)
        if htmlInfo:
            tree = etree.HTML(htmlInfo)
            href_list = tree.xpath('*//div[@class="nc-items nc-sub"]/a/@href')
            cbd_lists = tree.xpath('*//div[@class="nc-items nc-sub"]/a/span/text()')
            for href, cbd in zip(href_list, cbd_lists):
                # skip the link that points back to the current page
                if href == url.split('.com')[-1]:
                    continue
                url_list.append('http://www.dianping.com' + href)
                area_list.append(area)
                cbd_list.append(cbd)

        # return empty lists when the page fails to load, so the caller can still unpack three values
        return url_list, area_list, cbd_list

    def analysis_shop(self):
        for district_url, district in self.LIST:
            time.sleep(random.uniform(1, 1.5))
            url_list, area_list, cbd_list = self.analysis_cbd(district_url, district)
            for url, area, cbd in zip(url_list, area_list, cbd_list):
                time.sleep(random.uniform(1, 1.5))
                # Dianping shows at most 50 result pages per CBD
                for page in xrange(1, 51):
                    time.sleep(random.uniform(2, 3))
                    url_each = url + 'p' + str(page)
                    print url_each, area, cbd
                    try:
                        html = self.getPage(url_each)
                    except Exception:
                        print u"Request timed out..."
                    else:
                        if html is not None:
                            tree = etree.HTML(html)
                            # an h4 in this block means "no matching merchants", so stop paging this CBD
                            if len(tree.xpath('//div[@class="section Fix"]/div[2]/div[1]/div/h4/text()')) > 0:
                                print u"No matching merchants found among all Shanghai listings"
                                break
                            else:


                                    li_list = tree.xpath('//*[@class="shop-list J_shop-list shop-all-list"]/ul/li')
                                    for li in li_list:
                                        try:
                                            if len(li.xpath('./div[@class="pic"]/a/@href')) > 0:
                                                shop_id = li.xpath('./div[@class="pic"]/a/@href')[0].replace('/shop/','')
                                            else:
                                                shop_id = 'NULL'

                                            if len(li.xpath('./div[@class="txt"]/div[1]/a/@title'))> 0:
                                                shop_name = li.xpath('./div[@class="txt"]/div[1]/a/@title')[0]
                                            else:
                                                shop_name = 'NULL'

                                            if len(li.xpath('./div[@class="txt"]/div[2]/span/@title')) > 0:
                                                level = li.xpath('./div[@class="txt"]/div[2]/span/@title')[0]
                                            else:
                                                level = 'NULL'

                                            if len(li.xpath('./div[@class="txt"]/div[2]/a[1]/b/text()')) > 0:
                                                comment = li.xpath('./div[@class="txt"]/div[2]/a[1]/b/text()')[0]
                                            else:
                                                comment = '0'

                                            if len(li.xpath('./div[@class="txt"]/div[2]/a[2]/b/text()')) > 0:
                                                average = li.xpath('./div[@class="txt"]/div[2]/a[2]/b/text()')[0].replace(u"¥",'')
                                            else:
                                                average = '0'

                                            if len(li.xpath('./div[@class="txt"]/div[3]/span/text()')) > 0:
                                                adress = li.xpath('./div[@class="txt"]/div[3]/span/text()')[0]
                                            else:
                                                adress = 'NULL'

                                            if len(li.xpath('./div[@class="svr-info"]')) > 0:
                                                group_purchase = 'Y'
                                            else:
                                                group_purchase = 'N'
                                        except Exception:
                                            print u"Parse error..."
                                        else:
                                            # insert the parsed record into MySQL
                                            # print shop_id, shop_name, level, area, cbd, adress, comment, average, group_purchase
                                            try:
                                                self.db.set_character_set('utf8')
                                                # NOTE: the SQL is built by plain string concatenation; a quote in any value will break it
                                                self.sql = "INSERT INTO gz_bi_shanghai_shop(date,platfrom,shopid,shopname,level,area,cbd,adress,comments,average,group_purchase) VALUES ('" + self.dates + "',1,'" + shop_id + "','" + shop_name + "','" + level + "', '" + area + "', '" + cbd + "', '" + adress + "', '" + comment + "', '" + average + "', '" + group_purchase + "')"

                                                # print(self.sql)
                                                try:
                                                    result = self.cursor.execute(self.sql)
                                                    insert_id = self.db.insert_id()
                                                    self.db.commit()
                                                    # check whether the insert succeeded
                                                    if result:
                                                        print "Inserted successfully: %s" % insert_id
                                                    else:
                                                        print "Insert returned NULL"
                                                except MySQLdb.Error, e:
                                                    print(e)
                                                    # roll back on error
                                                    self.db.rollback()
                                                    # duplicate primary key: the row already exists
                                                    if "key 'PRIMARY'" in e.args[1]:
                                                        print "Record already exists, nothing inserted"
                                                    else:
                                                        print "Insert failed, reason %d: %s" % (e.args[0], e.args[1])
                                            except MySQLdb.Error, e:
                                                print "Database error, reason %d: %s" % (e.args[0], e.args[1])

if __name__ == '__main__':
    print u"Starting the crawler, please wait..."
    a = DpShangHai()
    a.analysis_shop()
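
For reference, the INSERT statement in the code above assumes a table roughly like the sketch below. Only the column names (including the spellings platfrom and adress) are taken from the INSERT itself; the types, lengths, and primary key are assumptions and may differ from the schema actually used.

# -*- coding: utf-8 -*-
# Hypothetical DDL for the target table; every type and the key choice are guesses.
import MySQLdb

db = MySQLdb.connect("IP", "username", "password", "database")
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS gz_bi_shanghai_shop (
        date           DATE,
        platfrom       INT,
        shopid         VARCHAR(32),
        shopname       VARCHAR(128),
        level          VARCHAR(32),
        area           VARCHAR(32),
        cbd            VARCHAR(64),
        adress         VARCHAR(255),
        comments       VARCHAR(16),
        average        VARCHAR(16),
        group_purchase CHAR(1),
        PRIMARY KEY (date, shopid)
    ) DEFAULT CHARSET=utf8
""")
db.close()

With some primary key in place, re-running the crawl on the same day simply falls into the duplicate-key branch handled in the code above.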


Because the volume here is small, this crawl does not go through a proxy and only rotates the User-Agent (a proxy sketch follows the UA list below). Here is the UA list:

# UserAgent.py (imported above via from UserAgent import user_agent_list)
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
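
As mentioned above, no proxy is used for this crawl. If one ever became necessary, requests can route traffic through a proxy via its proxies argument; a minimal sketch, where the proxy address is just a placeholder:

# -*- coding: utf-8 -*-
# Minimal sketch of sending a request through a proxy; the address below is a placeholder.
import random
import requests
from UserAgent import user_agent_list

proxies = {
    'http': 'http://127.0.0.1:8888',   # substitute a real proxy here
    'https': 'http://127.0.0.1:8888',
}
session = requests.session()
headers = {'User-Agent': random.choice(user_agent_list)}
resp = session.get('http://www.dianping.com/search/keyword/1/0_%E7%BE%8E%E5%8F%91/r5',
                   headers=headers, proxies=proxies, timeout=60)
print resp.status_code

Picking a different proxy per request, the same way the User-Agent is picked, would give a basic rotation.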
Run it directly, the data is written straight into the MySQL database, and after a few hours all the data is there.
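
One more note on the write path: the SQL in analysis_shop is built by string concatenation, so a shop name containing a quote character will break the statement, and it is open to injection. Below is a sketch of the same INSERT using MySQLdb's parameter binding; the connection arguments and the sample row are placeholders.

# -*- coding: utf-8 -*-
# Parameterized version of the same INSERT; MySQLdb escapes the values itself.
import MySQLdb

def insert_shop(db, row):
    sql = ("INSERT INTO gz_bi_shanghai_shop"
           "(date,platfrom,shopid,shopname,level,area,cbd,adress,comments,average,group_purchase) "
           "VALUES (%s,1,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    cursor = db.cursor()
    try:
        cursor.execute(sql, row)
        db.commit()
    except MySQLdb.Error, e:
        db.rollback()
        print "Insert failed, reason %d: %s" % (e.args[0], e.args[1])

db = MySQLdb.connect("IP", "username", "password", "database", charset='utf8')
# placeholder row: date, shopid, shopname, level, area, cbd, adress, comments, average, group_purchase
insert_shop(db, ('2016-01-01', '12345', u'Example Shop', u'NULL', u'NULL',
                 u'NULL', u'NULL', '0', '0', 'N'))
db.close()

cursor.execute quotes the values itself, so there is no need to build the string by hand.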
