I spent all of yesterday writing this, revising it over and over and steadily improving its performance, and the code is finally ready to publish. If you find that the service domain no longer works, just change the Origin header to its http form.
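In concrete terms (assuming the rest of the headers stay unchanged), that just means the Origin entry in the headers dict of the script below becomes:

    "Origin": "http://you.ctrip.com",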
import csv
import requests
from bs4 import BeautifulSoup as bs
from multiprocessing import Pool
import gevent  # note: without gevent monkey patching, the greenlets below run their blocking requests calls one after another
class Comment(object):
    # Request headers
    headers = {
        "Cookie": "_abtest_userid=dc18ac38-f25f-488b-8535-e709964f2257; gad_city=a4f35f7b1b0a14c597bf3a50fb024f55; MKT_Pagesource=PC; _ga=GA1.2.7662667.1551412569; _gid=GA1.2.815726266.1551412569; _RSG=Wd46GhfGxU6uCJw.ghACmB; _RDG=2808fbf74f24dd23c52c70ca0da70efd1f; _RGUID=67bb429a-d018-49e0-bc0c-2ababee93338; appFloatCnt=25; manualclose=1; TicketSiteID=SiteID=1006; StartCity_Pkg=PkgStartCity=1; Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; Union=AllianceID=4897&SID=130026&OUID=&Expires=1552049736787; ASP.NET_SessionSvc=MTAuOC4xODkuNjJ8OTA5MHxqaW5xaWFvfGRlZmF1bHR8MTU1MDU2ODM3MDEyOQ; _gat=1; Mkt_UnionRecord=%5B%7B%22aid%22%3A%224897%22%2C%22timestamp%22%3A1551490732614%7D%5D; _RF1=125.88.24.39; _jzqco=%7C%7C%7C%7C%7C1.89900671.1551412570377.1551490727886.1551490732641.1551490727886.1551490732641.0.0.0.187.187; __zpspc=9.12.1551487431.1551490732.14%232%7Cwww.baidu.com%7C%7C%7C%7C%23; _bfi=p1%3D290510%26p2%3D290510%26v1%3D228%26v2%3D226; _bfa=1.1551412564624.2kwvv2.1.1551451759831.1551487429617.7.229; _bfs=1.25",
        "Host": "you.ctrip.com",
        "Origin": "https://you.ctrip.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }

    def __init__(self, url):
        self.url = url
    def request(self, poiID, districtEName, Filename):
        # Request payload; only poiID / districtEName vary between attractions
        data = {
            "poiID": poiID,
            "districtEName": districtEName,
            "districtId": 152,
            "pagenow": 1,
            "order": 3.0,
            "star": 0.0,
            "tourist": 0.0,
            "resourceId": 6766,
            "resourcetype": 2
        }
        response = requests.post(url=self.url, headers=Comment.headers, data=data)
        print(response.status_code)
        if response.status_code == 404:
            print(Filename + " not found, skipped")
            return
        soup = bs(response.text, "lxml")
        self.WriteScore(soup, Filename)
        pagetag = self.getPageNum(soup)
        # If the page-count element is missing, there is only a single page of comments
        pagenum = int(pagetag.get_text()) if pagetag is not None else 1
        print(Filename + " collecting...")
        try:
            # Comment pages are 1-indexed, so request pages 1..pagenum
            gevent.joinall([gevent.spawn(self.writeCommentInformation, j, poiID, districtEName, Filename) for j in range(1, pagenum + 1)])
        except Exception as e:
            print(e)
            print("error--------")
    # Fetch one page of data
    def getCommentInformation(self, page, poiID, districtEName):
        # Request payload
        data = {
            "poiID": poiID,  # attraction (POI) id
            "districtEName": districtEName,  # district name
            "districtId": 152,
            "pagenow": page,
            "order": 3.0,
            "star": 0.0,
            "tourist": 0.0,
            "resourceId": 6766,
            "resourcetype": 2
        }
        response = requests.post(url=self.url, headers=Comment.headers, data=data)
        soup = bs(response.text, "lxml")
        # Return authors, dates and comment texts
        return [author.get_text() for author in soup.find_all(name="a", attrs={"itemprop": "author"})], \
               [date.get_text() for date in soup.find_all(name="em", attrs={"itemprop": "datePublished"})], \
               [comment.get_text() for comment in soup.find_all(name="span", attrs={"class": "heightbox"})]

    # Get the overall, scenery, fun and value-for-money scores
    def getScore(self, soup):
        return [score.get_text() for score in soup.find_all(name="span", attrs={"class": "score"})]
    # Write one page of comment data
    def writeCommentInformation(self, i, poiID, districtEName, FileName):
        author, date, comment = self.getCommentInformation(i, poiID, districtEName)
        print(FileName + " page " + str(i) + " collected -- " + str(len(author)) + " comments")
        if len(author) == 0:
            return
        with open("/Volumes/Tigo/Data/" + FileName + ".csv", "a+", encoding="utf-8", newline="") as f:
            w = csv.writer(f)
            w.writerows(list(zip(author, date, comment)))

    def getPageNum(self, soup):
        return soup.find(name="b", attrs={"class": "numpage"})

    # Write the attraction's scores
    def WriteScore(self, soup, Filename):
        filewriteScore = open("/Volumes/Tigo/Data/" + Filename + "_score.csv", "a+", encoding="utf-8", newline="")
        writeFile = csv.writer(filewriteScore)
        l = [Filename]
        l.extend(self.getScore(soup))
        writeFile.writerow(l)
        filewriteScore.close()
if __name__ == "__main__":
    url = "http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView"  # request endpoint -- fixed; only the POST payload changes from attraction to attraction
    c = Comment(url)
    pool = Pool(5)
    with open("/Volumes/Tigo/RequestsData.csv", "r") as f:  # read attraction id and district name from the file
        r = csv.reader(f)
        for i in r:
            # column 2 = poiID, column 4 = districtEName, column 0 = attraction name used for the output file
            pool.apply_async(func=c.request, args=(i[2], i[4], i[0],))
    pool.close()
    pool.join()
The dataset is here:
https://github.com/LianZS/SpyderPro/blob/master/RequestsData.xlsx
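If you only want to pull a single attraction without the id file, a minimal sketch reusing the Comment class defined above looks like this (the poiID, district name and output file name below are placeholders, not real entries from the dataset):

url = "http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView"
c = Comment(url)
c.request(10001, "beijing", "example_attraction")  # placeholder poiID / districtEName / output name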