yilong_spider.py:
import scrapy
import urllib.request
import requests
import demjson
from scrapy.spiders import CrawlSpider
from yilong.items import YilongItem
import re
from bs4 import BeautifulSoup
class yilongSpider(CrawlSpider):
    """Crawl hotel listings for Hangzhou on elong.com and yield one
    YilongItem per user comment (nickname, comment text, score, date).

    The spider drives two AJAX endpoints directly with ``requests``
    rather than scheduling Scrapy requests, so ``start_urls`` is only
    the seed that triggers ``parse``.
    """

    name = "yilong"
    # Must be a list of domains; a bare string would be iterated
    # character by character by Scrapy's offsite middleware.
    allowed_domains = ["elong.com"]
    start_urls = [
        "http://hotel.elong.com/",
    ]

    @staticmethod
    def _remove_emoji(comment, restr=''):
        """Return *comment* with emoji replaced by *restr*.

        Matches astral-plane code points on wide Python builds and
        falls back to matching UTF-16 surrogate pairs on narrow builds.
        """
        try:
            pattern = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # Narrow build: astral characters appear as surrogate pairs.
            pattern = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return pattern.sub(restr, comment)

    def parse(self, response):
        """Fetch paged hotel lists for Hangzhou, then fetch comment
        pages for every hotel id and yield one item per comment."""
        list_url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
        list_headers = {
            'Accept': 'application/jsontext/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # NOTE: do NOT hard-code Content-Length here — requests
            # computes it from the form body; a wrong fixed value
            # ('1665' in the original) corrupts the request.
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'hotel.elong.com',
            'Origin': 'http://hotel.elong.com',
            'Referer': 'http://hotel.elong.com/hangzhou/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            # NOTE(review): session-bound cookie — it will expire and
            # must be refreshed before each crawl.
            'cookie': 'CookieGuid=ab7f1877-dfda-4882-b9c8-6116d3a41166; s_eVar44=brand360sem; _fid=e607b469-3334-454a-90ef-e7282f7a4a45; __guid=206901770.676507428123648000.1541419138672.057; SHBrowseHotel=cn=91450084%2C%2C%2C%2C%2C%2C%3B92375725%2C%2C%2C%2C%2C%2C%3B21201436%2C%2C%2C%2C%2C%2C%3B&; ShHotel=CityID=1201&CityNameCN=%E6%9D%AD%E5%B7%9E%E5%B8%82&CityName=%E6%9D%AD%E5%B7%9E%E5%B8%82&OutDate=2018-11-09&CityNameEN=hangzhou&InDate=2018-11-08; SessionGuid=da5a119d-ff37-4b56-8f62-5b9b0615858a; Esid=59a9e2d7-1b62-4aa6-a390-56a3f43613c3; semtcid=cd67cc75-c72d-42a7-a755-180e4d22f6e9; semid=brand360sem; outerFrom=brand360sem; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=1&Isusefparam=0&Pkid=50792&Parentid=4300&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=9001; fv=pcweb; ext_param=bns%3D4%26ct%3D3; s_cc=true; s_visit=1; newjava2=0b48025058222f4e0d14b79dbc0d2df5; JSESSIONID=B97F1B21A3B60E349023A0C13129345D; anti_token=42126823-FBE1-4301-8FE6-50A6812CF3D5; __tctmb=0.2119295649609267.1541672954728.1541672954728.1; __tccgd=0.0; __tctmc=0.136017320; monitor_count=37; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fhangzhou%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tctmd=0.1',
        }
        for page in range(1, 10):
            form = {
                'code': '8999595',
                'listRequest.pageIndex': page,
                'listRequest.pageSize': '20',
                'listRequest.cityName': '杭州市',
            }
            list_resp = requests.post(list_url, data=form, headers=list_headers)
            # Parse the JSON once and reuse it (the original re-parsed
            # the response body for every field it read).
            payload = list_resp.json()['value']
            hotel_ids = payload['hotelIds'].split(',')
            soup = BeautifulSoup(payload['hotelListHtml'], 'html.parser')
            # Pre-create one output file per hotel name, taken from the
            # <img alt="..."> attributes. Close each handle immediately —
            # the original leaked one descriptor per hotel.
            for img in soup.find_all('img'):
                with open(img['alt'] + '.txt', 'a+', encoding='UTF-8'):
                    pass
            for hotel_id in hotel_ids:
                print(hotel_id)
                # Distinct loop variable: the original reused `i` for
                # both the list pages and the comment pages.
                for comment_page in range(1, 10):
                    # NOTE(review): the stray '+' after the hotel id is
                    # preserved from the original URL — likely a typo;
                    # confirm against the comment API before removing.
                    comment_url = (
                        'http://hotel.elong.com/ajax/comment/getcommentbypage/'
                        '?hotelId=' + hotel_id + '+&recommendedType=1'
                        '&pageIndex=' + str(comment_page) +
                        '&mainTagId=0&subTagId=0&rankType=0'
                        '&eToken=e607b469-3334-454a-90ef-e7282f7a4a45'
                        '&code=7051534&_=1541673964193'
                    )
                    comment_headers = {
                        'Accept': 'application/json, text/javascript, */*; q=0.01',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Connection': 'keep-alive',
                        'Host': 'hotel.elong.com',
                        'Referer': 'http://hotel.elong.com/21201502/',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                        'X-Requested-With': 'XMLHttpRequest',
                    }
                    comment_resp = requests.get(comment_url, headers=comment_headers)
                    # Iterate the actual comment list instead of a fixed
                    # range(20): the last page may hold fewer comments,
                    # which made the original raise IndexError.
                    comments = comment_resp.json()['value']['Comments']
                    for idx, entry in enumerate(comments):
                        print(idx)
                        nick = self._remove_emoji(
                            entry['CommentUser']['NickName'])
                        print(nick)
                        content = self._remove_emoji(entry['Content'])
                        print(content)
                        created = entry['createTimeString']
                        print(created)
                        score = str(entry['Source'])
                        print(score)
                        item = YilongItem()
                        item['name'] = nick
                        item['comment'] = content
                        item['score'] = score
                        item['time'] = created
                        yield item
items.py:
# -*- coding: utf-8 -*-
import scrapy
class YilongItem(scrapy.Item):
    """Container for one hotel comment scraped from elong.com."""

    name = scrapy.Field()     # commenter nickname (emoji stripped)
    comment = scrapy.Field()  # comment body text (emoji stripped)
    score = scrapy.Field()    # rating, stored as a string
    time = scrapy.Field()     # comment creation date string
pipelines.py:
import pymysql
class YilongPipeline(object):
    """Append every scraped comment to a plain-text file.

    An earlier commented-out MySQL implementation was removed; if it is
    reinstated, use parameterized queries (cursor.execute with a params
    tuple) instead of %-interpolating values into the SQL string.
    """

    def process_item(self, item, spider):
        """Append *item*'s name/score/comment/time to the output file.

        Returns the item unchanged so later pipeline stages still run.
        Uses a raw string for the Windows path: the original non-raw
        literal only worked because '\\p' happens not to be a recognized
        escape sequence.
        """
        record = (item['name'] + '\n' + item['score'] + '\n'
                  + item['comment'] + '\n' + item['time'] + '\n\n')
        # Context manager guarantees the handle closes even if the
        # write raises (the original leaked it on failure).
        with open(r'D:\python1\comment.txt', 'a+', encoding='UTF-8') as out:
            out.write(record)
        return item
学习总结:
1.在该任务中学会了scrapy框架的基本结构,对框架有了一定的了解