Scraping Elong (艺龙) hotel reviews with Scrapy

yilong_spider.py:
import re

import requests
from bs4 import BeautifulSoup
from scrapy.spiders import CrawlSpider

from yilong.items import YilongItem
class yilongSpider(CrawlSpider):
    name = "yilong"
    allowed_domains = ["elong.com"]
    start_urls = [
        "http://hotel.elong.com/"
    ]

    def parse(self, response):
        def remove_emoji(comment, restr=''):
            # strip emoji: wide Python builds match code points above the
            # BMP directly; narrow builds fall back to surrogate pairs
            try:
                co = re.compile(u'[\U00010000-\U0010ffff]')
            except re.error:
                co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
            return co.sub(restr, comment)
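        # illustration only: on a wide build remove_emoji(u'房间很棒\U0001F44D')
        # drops the emoji and returns u'房间很棒'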

        url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
        # Content-Length is intentionally omitted: requests computes it from
        # the form body, and a stale hard-coded value corrupts the POST
        header = {'Accept': 'application/json, text/javascript, */*; q=0.01',
                  'Accept-Encoding': 'gzip, deflate',
                  'Accept-Language': 'zh-CN,zh;q=0.9',
                  'Connection': 'keep-alive',
                  'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                  'Host': 'hotel.elong.com',
                  'Origin': 'http://hotel.elong.com',
                  'Referer': 'http://hotel.elong.com/hangzhou/',
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                  'X-Requested-With': 'XMLHttpRequest',
                  'cookie': 'CookieGuid=ab7f1877-dfda-4882-b9c8-6116d3a41166; s_eVar44=brand360sem; _fid=e607b469-3334-454a-90ef-e7282f7a4a45; __guid=206901770.676507428123648000.1541419138672.057; SHBrowseHotel=cn=91450084%2C%2C%2C%2C%2C%2C%3B92375725%2C%2C%2C%2C%2C%2C%3B21201436%2C%2C%2C%2C%2C%2C%3B&; ShHotel=CityID=1201&CityNameCN=%E6%9D%AD%E5%B7%9E%E5%B8%82&CityName=%E6%9D%AD%E5%B7%9E%E5%B8%82&OutDate=2018-11-09&CityNameEN=hangzhou&InDate=2018-11-08; SessionGuid=da5a119d-ff37-4b56-8f62-5b9b0615858a; Esid=59a9e2d7-1b62-4aa6-a390-56a3f43613c3; semtcid=cd67cc75-c72d-42a7-a755-180e4d22f6e9; semid=brand360sem; outerFrom=brand360sem; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=1&Isusefparam=0&Pkid=50792&Parentid=4300&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=9001; fv=pcweb; ext_param=bns%3D4%26ct%3D3; s_cc=true; s_visit=1; newjava2=0b48025058222f4e0d14b79dbc0d2df5; JSESSIONID=B97F1B21A3B60E349023A0C13129345D; anti_token=42126823-FBE1-4301-8FE6-50A6812CF3D5; __tctmb=0.2119295649609267.1541672954728.1541672954728.1; __tccgd=0.0; __tctmc=0.136017320; monitor_count=37; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fhangzhou%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tctmd=0.1'
                  }
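        # note: the cookie above and the eToken/code query parameters below
        # were captured from one browser session; they expire, so refresh
        # them from the site before running the spider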
    for i in range(1, 10):
        data = {'code': '8999595',
                'listRequest.pageIndex': i,
                'listRequest.pageSize': '20',
                'listRequest.cityName': '杭州市',
                }
        # print(data['listRequest.pageIndex'])
        html = requests.post(url, data=data, headers=header)
        text = html.json()['value']['hotelIds']
        text = text.split(',')
        text1 = html.json()['value']['hotelListHtml']
        soup = BeautifulSoup(text1, 'html.parser')
        hotelname = soup.find_all('img')
        # print(hotelname)
        for k in hotelname:
            f = open(k['alt'] + '.txt', 'a+', encoding='UTF-8')
            for a in text:
                print(a)
                for i in range(1, 10):
                    url1 = 'http://hotel.elong.com/ajax/comment/getcommentbypage/?hotelId=' + a + '+&recommendedType=1&pageIndex=' + str(
                        i) + '&mainTagId=0&subTagId=0&rankType=0&eToken=e607b469-3334-454a-90ef-e7282f7a4a45&code=7051534&_=1541673964193'
                    headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
                               'Accept-Encoding': 'gzip, deflate',
                               'Accept-Language': 'zh-CN,zh;q=0.9',
                               'Connection': 'keep-alive', 'Host': 'hotel.elong.com',
                               'Referer': 'http://hotel.elong.com/21201502/',
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                               'X-Requested-With': 'XMLHttpRequest'}
                    html1 = requests.get(url1, headers=headers)
                    # request = urllib.request.Request(url1, headers=headers)
                    # response = urllib.request.urlopen(request)

                    for o in range(20):
                        print(o)
                        name = html1.json()['value']['Comments'][o]['CommentUser']['NickName']
                        name = remove_emoji(name, restr='')
                        print(name)
                        comment = html1.json()['value']['Comments'][o]['Content']
                        comment = remove_emoji(comment, restr='')
                        print(comment)
                        time = html1.json()['value']['Comments'][o]['createTimeString']
                        print(time)
                        score = html1.json()['value']['Comments'][o]['Source']
                        score = str(score)
                        print(score)
                        item = YilongItem()
                        item['name'] = name
                        # print(item['name'])
                        item['comment'] = comment
                        item['score'] = score
                        item['time'] = time
                        yield item
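The spider is launched with `scrapy crawl yilong` from the project root. As a minimal alternative sketch, it can also be run as a plain script through Scrapy's CrawlerProcess (the import path assumes the spider module lives at yilong/spiders/yilong_spider.py):

# run_yilong.py -- sketch; assumes the yilong project package is importable
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from yilong.spiders.yilong_spider import yilongSpider

process = CrawlerProcess(get_project_settings())  # picks up ITEM_PIPELINES etc.
process.crawl(yilongSpider)
process.start()  # blocks until the crawl finishes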
items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YilongItem(scrapy.Item):
    # one item per review: reviewer nickname, review text, score, and date
    name = scrapy.Field()
    comment = scrapy.Field()
    score = scrapy.Field()
    time = scrapy.Field()
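For reference, a scrapy.Item behaves like a dict, which is how the spider fills it field by field (illustrative values only):

item = YilongItem()
item['name'] = u'匿名用户'   # assigning an undeclared key raises KeyError, catching typos
item['score'] = '4.5'
print(dict(item))            # -> {'name': u'匿名用户', 'score': '4.5'}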

pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class YilongPipeline(object):
    # def __init__(self):
    #     self.conn = pymysql.connect(host='localhost',
    #                                 user='root',
    #                                 password='123',
    #                                 db='test',
    #                                 charset='utf8'
    #                                 )
    #     cursor = self.conn.cursor()
    #     cursor.execute("DROP TABLE IF EXISTS comment")
    #     sql = """CREATE TABLE comment(name char(25), 分数 char(110), 评价 text(10000), 日期 char(25))"""
    #     cursor.execute(sql)

    def process_item(self, item, spider):
        # append each review to a plain text file; the raw string avoids
        # backslash-escape surprises in the Windows path, and the with
        # block guarantees the file is closed
        with open(r'D:\python1\comment.txt', 'a+', encoding='UTF-8') as f:
            f.write(item['name'] + '\n' + item['score'] + '\n'
                    + item['comment'] + '\n' + item['time'] + '\n\n')
        # cursor = self.conn.cursor()
        # cursor.execute("INSERT INTO comment(name, 分数, 评价, 日期) VALUES ('%s','%s','%s','%s');" % (item['name'], item['score'], item['comment'], item['time']))
        # self.conn.commit()
        return item

    # def close_spider(self, spider):
    #     self.conn.close()
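If the commented-out MySQL path is revived, the values should be passed to cursor.execute as parameters rather than %-formatted into the SQL string; pymysql then escapes them, which avoids quoting bugs and SQL injection. A minimal sketch of that variant of process_item:

def process_item(self, item, spider):
    cursor = self.conn.cursor()
    # pymysql substitutes the %s placeholders safely
    cursor.execute(
        "INSERT INTO comment(name, 分数, 评价, 日期) VALUES (%s, %s, %s, %s)",
        (item['name'], item['score'], item['comment'], item['time'])
    )
    self.conn.commit()
    return item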

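Either way, the pipeline only runs if it is registered in settings.py (the number is a priority from 0 to 1000; lower values run first):

# settings.py
ITEM_PIPELINES = {
    'yilong.pipelines.YilongPipeline': 300,
}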
Learning summary:
1. Through this task I learned the basic structure of a Scrapy project (spider, items, pipeline) and gained a working understanding of the framework.
