高性能爬取携程网景区评论数据

昨天花了一天时间编写,经过反复修改、不断提升性能,终于可以把代码放出来了。如果请求时发现服务域名无效,只需把请求头中的 Origin 由 https 改为 http 形式(即 http://you.ctrip.com)即可。

 

 

import csv

import requests
from bs4 import BeautifulSoup as bs
from  multiprocessing import Pool
import  gevent,time



class Comment(object):
    """Scrape visitor comments and rating scores for Ctrip scenic spots.

    Every request is a form POST to ``self.url``; only the form payload
    changes between scenic spots and pages.  Results are appended to CSV
    files under ``output_dir``: ``<name>.csv`` holds (author, date, comment)
    rows and ``<name>评分.csv`` holds one row of scores.

    NOTE(review): pages are fetched through gevent greenlets, but nothing in
    this class applies gevent's monkey patching; presumably the blocking
    ``requests`` calls therefore run one after another -- confirm.
    """

    # Request headers sent with every POST.
    headers = {
        "Cookie":"_abtest_userid=dc18ac38-f25f-488b-8535-e709964f2257; gad_city=a4f35f7b1b0a14c597bf3a50fb024f55; MKT_Pagesource=PC; _ga=GA1.2.7662667.1551412569; _gid=GA1.2.815726266.1551412569; _RSG=Wd46GhfGxU6uCJw.ghACmB; _RDG=2808fbf74f24dd23c52c70ca0da70efd1f; _RGUID=67bb429a-d018-49e0-bc0c-2ababee93338; appFloatCnt=25; manualclose=1; TicketSiteID=SiteID=1006; StartCity_Pkg=PkgStartCity=1; Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; Union=AllianceID=4897&SID=130026&OUID=&Expires=1552049736787; ASP.NET_SessionSvc=MTAuOC4xODkuNjJ8OTA5MHxqaW5xaWFvfGRlZmF1bHR8MTU1MDU2ODM3MDEyOQ; _gat=1; Mkt_UnionRecord=%5B%7B%22aid%22%3A%224897%22%2C%22timestamp%22%3A1551490732614%7D%5D; _RF1=125.88.24.39; _jzqco=%7C%7C%7C%7C%7C1.89900671.1551412570377.1551490727886.1551490732641.1551490727886.1551490732641.0.0.0.187.187; __zpspc=9.12.1551487431.1551490732.14%232%7Cwww.baidu.com%7C%7C%7C%7C%23; _bfi=p1%3D290510%26p2%3D290510%26v1%3D228%26v2%3D226; _bfa=1.1551412564624.2kwvv2.1.1551451759831.1551487429617.7.229; _bfs=1.25",
        "Host": "you.ctrip.com",
        "Origin": "https://you.ctrip.com",
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    }

    # Path prefix (including the trailing separator) for all output files.
    # Override on the class or on an instance to write somewhere else.
    output_dir = "/Volumes/Tigo/Data/"

    # Seconds to wait on the server before a request is abandoned, so that a
    # stalled connection cannot hang a worker forever.
    timeout = 30

    def __init__(self, url):
        """Store the endpoint that all comment requests are posted to."""
        self.url = url

    def _payload(self, page, poiID, districtEName):
        """Build the form payload for one page of one scenic spot."""
        return {
            "poiID": poiID,  # scenic-spot id
            "districtEName": districtEName,  # district name
            "districtId": 152,
            "pagenow": page,
            "order": 3.0,
            "star": 0.0,
            "tourist": 0.0,
            "resourceId": 6766,
            "resourcetype": 2,
        }

    def _post(self, page, poiID, districtEName):
        """POST the payload for ``page`` and return the raw response."""
        return requests.post(
            url=self.url,
            headers=Comment.headers,
            data=self._payload(page, poiID, districtEName),
            timeout=self.timeout,
        )

    def _page_count(self, soup):
        """Return the number of comment pages advertised by ``soup``.

        Falls back to 1 when the pager element is absent or not numeric, so
        that the first page is still attempted instead of crashing.
        """
        pager = self.getPageNum(soup)
        if pager is None:
            return 1
        try:
            return int(pager.get_text())
        except ValueError:
            return 1

    def request(self, poiID, districtEName, Filename):
        """Collect the scores and every comment page of one scenic spot.

        Args:
            poiID: scenic-spot id sent in the payload.
            districtEName: district name sent in the payload.
            Filename: base name of the output CSV files; also used as a
                prefix for progress messages.
        """
        try:
            response = self._post(1, poiID, districtEName)
        except requests.RequestException as e:
            # A network failure for one spot must not take the worker down.
            print(e)
            print(Filename+"Nodone!")
            return
        print(response.status_code)
        if not response.ok:
            # Any 4xx/5xx body is an error page, not comment markup.
            print(Filename+"Nodone!")
            return
        soup = bs(response.text,"lxml")
        self.WriteScore(soup,Filename)
        pagenum = self._page_count(soup)

        print(Filename+"正在采集...")
        try:
            # The first request above uses pagenow=1, so pages run 1..pagenum.
            gevent.joinall([
                gevent.spawn(self.writeCommentInformation, j, poiID, districtEName, Filename)
                for j in range(1, pagenum + 1)
            ])
        except Exception as e:
            print(e)
            print("error--------")

    def getCommentInformation(self,page,poiID,districtEName):
        """Fetch one comment page.

        Returns:
            A tuple of three lists: author texts, date texts and comment
            texts, each in document order.
        """
        response = self._post(page, poiID, districtEName)
        soup = bs(response.text, "lxml")
        authors = [tag.get_text() for tag in soup.find_all(name="a", attrs={"itemprop": "author"})]
        dates = [tag.get_text() for tag in soup.find_all(name="em", attrs={"itemprop": "datePublished"})]
        comments = [tag.get_text() for tag in soup.find_all(name="span", attrs={"class": "heightbox"})]
        return authors, dates, comments

    def getSocre(self,soup):
        """Return the text of every ``<span class="score">`` in ``soup``."""
        return [score.get_text() for score in soup.find_all(name="span",attrs={"class":"score"})]

    def writeCommentInformation(self,i, poiID, districtEName,FileName):
        """Fetch page ``i`` and append its rows to ``<FileName>.csv``.

        Nothing is written when the page yields no authors.
        """
        author,date,comment = self.getCommentInformation(i, poiID, districtEName)
        print(FileName+str(i)+"页采集完毕--"+str(len(author)))
        if not author:
            return
        # newline="" lets the csv module control line endings itself.
        with open(self.output_dir+FileName+".csv","a",encoding="utf-8",newline="") as f:
            csv.writer(f).writerows(zip(author,date,comment))

    def getPageNum(self,soup):
        """Return the ``<b class="numpage">`` element, or ``None`` if absent."""
        return soup.find(name="b",attrs={"class":"numpage"})

    def WriteScore(self,soup,Filename):
        """Append one row ``[Filename, *scores]`` to ``<Filename>评分.csv``."""
        row = [Filename]
        row.extend(self.getSocre(soup))
        # The context manager closes the file even if writing raises.
        with open(self.output_dir + Filename+"评分" + ".csv", "a", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(row)




if __name__=="__main__":
    # Entry point: read scenic-spot rows from the request CSV and scrape each
    # one in a pool of 5 worker processes.

    # Request endpoint -- fixed; only the posted payload changes per spot.
    url = "http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView"
    c = Comment(url)
    pool = Pool(5)

    def report_failure(error):
        # apply_async discards worker exceptions unless a callback (or
        # .get()) collects them; print them so failed spots are visible.
        print(error)
        print("error--------")

    try:
        # newline="" is what the csv module expects for files it parses.
        with open("/Volumes/Tigo/RequestsData.csv","r",newline="") as f:
            for row in csv.reader(f):
                # Columns 0, 2 and 4 are used below; skip blank/short rows
                # instead of aborting the whole run with an IndexError.
                if len(row) < 5:
                    continue
                # row[2] -> poiID, row[4] -> districtEName, row[0] -> Filename
                pool.apply_async(
                    func=c.request,
                    args=(row[2],row[4],row[0],),
                    error_callback=report_failure,
                )
    finally:
        # Always let already-submitted tasks finish, even if reading fails.
        pool.close()
        pool.join()


 

数据集在这里(文件为 xlsx 格式,而代码读取的是 RequestsData.csv,使用前需先另存为 CSV):

https://github.com/LianZS/SpyderPro/blob/master/RequestsData.xlsx

你可能感兴趣的:(爬虫,爬虫技术基础及实战)