Starting from the JD search box, search for the phone and open its product page; the goal is to crawl the comment information from the page https://item.jd.com/100008348542.html. The content to collect is the comment data (username, comment text, star rating, comment time, etc.). After crawling, clean the data and save it in a format that is easy to display and browse.
Observation shows that when switching comment pages, the page URL itself does not change, so the real address that returns the comments has to be found and requested directly.
Finding the real address: open the browser developer tools (F12), click Network, search for "comments", refresh the page, find the real data file under All, and read the real comment URL from its Headers.
Locating the comments:
Opening the real address shows the raw comment data directly.
Comparing the addresses of different pages shows that only the page parameter changes, so turning pages only requires changing the value of page.
The response can be converted to JSON, i.e. the string is parsed into a dictionary; the comments live in the comments field, and the text of each comment is in its content field. From each entry we extract the comment content and the star rating score.
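Since the original screenshots are not reproduced here, a quick interactive check of the response shape might look like the sketch below (the field names comments, content and score are the ones used by the code that follows; the exact response layout may differ):

import requests
import json

# request the first comment page directly (same parameters as in the script below)
url = ('https://club.jd.com/comment/productPageComments.action'
       '?&productId=100008348542&score=3&sortType=5&page=0&pageSize=10'
       '&isShadowSku=0&rid=0&fold=1')
resp = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
data = json.loads(resp.text)              # str -> dict
print(list(data.keys()))                  # 'comments' should be among the keys
first = data['comments'][0]
print(first['content'], first['score'])   # comment text and star rating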
# -*- coding:utf-8 -*-
import requests
import json
import time
import random
import xlwt

# create an Excel workbook to store the data (xlwt writes the legacy .xls format)
file = xlwt.Workbook(encoding='utf-8')
sheet = file.add_sheet('data', cell_overwrite_ok=True)
start = time.time()
for i in range(100):  # loop over the pages to crawl comments in batches
    try:
        # Build the url. Clicking "next page" on the site shows that only the page
        # number in the url changes, which is what this loop relies on.
        # The callback parameter is dropped: it carries no useful data and the
        # wrapper it adds would break the json conversion below.
        url = 'https://club.jd.com/comment/productPageComments.action?&productId=100008348542&score=3&sortType=5&page=%s&pageSize=10&isShadowSku=0&rid=0&fold=1' % i
        # build the headers
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
            'referer': 'https://item.jd.com/100008348542.html'
        }
        response = requests.get(url, headers=headers)
        data = json.loads(response.text)  # parse the string into a dict
        page = i * 10  # row offset in the sheet: each url yields 10 comments, i.e. 10 rows
        if data['comments']:
            for temp in data['comments']:
                sheet.write(page, 0, page)             # running index
                sheet.write(page, 1, temp['content'])  # comment text
                sheet.write(page, 2, temp['score'])    # star rating given by the user
                page = page + 1
            print('page %s crawled successfully' % i)
        else:
            print('.............page %s failed' % i)
        file.save('comments.xls')  # save locally (xlwt cannot write real .xlsx files)
    except Exception as e:
        print('crawl failed, url: %s' % url)
        print('page is %s' % i)
        continue
    time.sleep(random.random() * 5)  # pause for a random interval before the next page
end = time.time()
file.save('comments.xls')
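The comments in the script note that the callback parameter was removed from the URL because the JSONP wrapper it produces breaks json.loads. If the parameter were kept, one way to strip a wrapper of the (assumed) form callbackName({...}); before parsing is sketched here:

import json

def strip_jsonp(text):
    # keep only the part between the first '(' and the last ')'
    start = text.find('(') + 1
    end = text.rfind(')')
    return text[start:end]

# usage: data = json.loads(strip_jsonp(response.text))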
To speed up the crawler, a multithreaded version can download pages concurrently. First define a list link_list, store the page addresses in it, and create 5 threads that each crawl one slice of the list. The code is shown below:
import threading
import requests
import time

# build the list of comment-page urls
link_list = []
url_head = 'https://club.jd.com/comment/productPageComments.action?&productId=100008348542&score=3&sortType=5'
url_middle = '&page='
url_end = '&pageSize=10&isShadowSku=0&rid=0&fold=1'
for i in range(0, 100):
    url = url_head + url_middle + str(i) + url_end
    link_list.append(url)

start = time.time()

class myThread(threading.Thread):
    def __init__(self, name, link_range):
        threading.Thread.__init__(self)
        self.name = name
        self.link_range = link_range

    def run(self):
        print("Starting " + self.name)
        crawler(self.name, self.link_range)
        print("Exiting " + self.name)

# fetch the pages assigned to one thread
def crawler(threadName, link_range):
    for i in range(link_range[0], link_range[1] + 1):
        try:
            r = requests.get(link_list[i], timeout=20)
            print(threadName, r.status_code)
        except Exception as e:
            print(threadName, 'Error: ', e)

thread_list = []
# 100 urls split into 5 slices of 20 (both ends inclusive)
link_range_list = [(0, 19), (20, 39), (40, 59), (60, 79), (80, 99)]
# create the threads
for i in range(1, 6):
    thread = myThread("Thread-" + str(i), link_range_list[i - 1])
    thread.start()
    thread_list.append(thread)
# wait for all threads to finish
for thread in thread_list:
    thread.join()
end = time.time()
print('Total time for the simple multithreaded crawler:', end - start)
print("Exiting Main Thread")
Create the Scrapy project:
scrapy startproject jd
Generate a spider inside the project (genspider takes a spider name and a domain):
scrapy genspider pachong club.jd.com
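The spider imports JdItem from the project's items.py, which is not shown in the original. A minimal items.py, assuming only the four fields used below, could be:

import scrapy

class JdItem(scrapy.Item):
    nickname = scrapy.Field()  # username
    content = scrapy.Field()   # comment text
    score = scrapy.Field()     # star rating
    time = scrapy.Field()      # comment time (creationTime)

The spider itself, spiders/pachong.py: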
import scrapy
import json
from ..items import JdItem

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

class PachongSpider(scrapy.Spider):
    name = 'pachong'
    allowed_domains = ['club.jd.com']  # the comment API lives on club.jd.com, not www.jd.com
    url_head = 'https://club.jd.com/comment/productPageComments.action?&productId=100008348542&score=0&sortType=5'
    url_middle = '&page='
    url_end = '&pageSize=10&isShadowSku=0&fold=1'

    def start_requests(self):
        # crawl 100 pages of comments (i.e. 1000 comments)
        for i in range(0, 100):
            url = self.url_head + self.url_middle + str(i) + self.url_end
            print("current page:", url)
            yield scrapy.Request(url=url, headers=headers, callback=self.parse)

    def parse(self, response):
        # the response body is the JSON returned by the comment API
        json_string = response.text
        data = json.loads(json_string)
        comments = data['comments']
        for i in range(len(comments)):
            item = JdItem()
            jd_nickname = comments[i]['nickname']
            jd_content = comments[i]['content']
            jd_score = comments[i]['score']
            jd_time = comments[i]['creationTime']
            # fill the item like a dict
            item["nickname"] = jd_nickname
            item["content"] = jd_content
            item["score"] = jd_score
            item["time"] = jd_time
            yield item
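The pipeline below (pipelines.py) writes each item into MySQL. It assumes a local database spider containing a table jd whose columns match the INSERT statement; a one-off setup sketch (the column types here are an assumption) is:

import pymysql

# create the target database and table once before crawling
conn = pymysql.connect(host="localhost", user="root", passwd="123456", charset="utf8")
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS spider DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS spider.jd (
        用户名 VARCHAR(100),
        评论 TEXT,
        星级 INT,
        评论时间 VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()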
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql

def dbHandle():
    # open a connection to the local MySQL server; fill in your own credentials
    conn = pymysql.connect(
        host="localhost",
        user="root",
        passwd="123456",
        charset="utf8",
        use_unicode=True  # the comment text is Chinese, so keep unicode handling on
    )
    return conn

class JdPipeline(object):
    def process_item(self, item, spider):
        dbObject = dbHandle()
        cursor = dbObject.cursor()
        cursor.execute("USE spider")
        # insert the item into the database
        sql = "INSERT INTO jd(用户名,评论,星级,评论时间) VALUES (%s,%s,%s,%s)"
        try:
            cursor.execute(sql,
                           (item['nickname'], item['content'], item['score'], item['time']))
            cursor.connection.commit()
        except BaseException as e:
            print("error here >>>>>>>>>>>>>", e, "<<<<<<<<<<<<< error here")
            dbObject.rollback()
        return item
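Opening a fresh connection for every item works but is wasteful. A common alternative, sketched here under the same schema assumptions, opens one connection when the spider starts and closes it when it finishes:

import pymysql

class JdPipeline(object):
    def open_spider(self, spider):
        # one connection for the whole crawl
        self.conn = pymysql.connect(host="localhost", user="root", passwd="123456",
                                    db="spider", charset="utf8")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = "INSERT INTO jd(用户名,评论,星级,评论时间) VALUES (%s,%s,%s,%s)"
        try:
            self.cursor.execute(sql, (item['nickname'], item['content'],
                                      item['score'], item['time']))
            self.conn.commit()
        except Exception as e:
            print("insert failed:", e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

The relevant parts of the project's settings.py follow.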
# Scrapy settings for jd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'jd'
SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'  # default user agent for requests
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'jd.middlewares.JdSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {  # enable the custom user-agent middleware
    'jd.middlewares.my_useragent': 543,
    # 'jd.middlewares.JdDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
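settings.py enables a custom downloader middleware jd.middlewares.my_useragent, whose code is not shown in the original. A sketch of what it might look like in middlewares.py (the class name must match the entry in DOWNLOADER_MIDDLEWARES; the user-agent pool is an example):

import random

class my_useragent(object):
    # a small pool of user-agent strings to rotate through
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    ]

    def process_request(self, request, spider):
        # overwrite the User-Agent header before the request is sent
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)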
To run the spider from a script (for example a start.py in the project root) instead of the command line:
from scrapy import cmdline
cmdline.execute('scrapy crawl pachong'.split())
Note: this blog post only records part of the author's learning process; corrections are welcome if anything is wrong.