Scraping jokes from Qiushibaike (https://www.qiushibaike.com/text/)
Create the project
scrapy startproject xiubai
Create the main spider
cd xiubai/xiubai/spiders
scrapy genspider xiubai_spider www.qiushibaike.com
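After these two commands the project should have the standard layout that Scrapy generates:

xiubai/
├── scrapy.cfg
└── xiubai/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── xiubai_spider.py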
Define the fields to scrape (items.py)
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class XiubaiItem(scrapy.Item):
    # avatar image URL
    avatar = scrapy.Field()
    # link to the author's profile page
    profile_link = scrapy.Field()
    # nickname
    name = scrapy.Field()
    # gender
    gender = scrapy.Field()
    # age
    age = scrapy.Field()
    # joke text
    content = scrapy.Field()
    # link to the joke's own page
    content_link = scrapy.Field()
    # number of upvotes
    up = scrapy.Field()
    # number of comments
    comment_num = scrapy.Field()
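A scrapy.Item behaves like a dict with a fixed set of keys, which is easy to verify in a Python shell; the values below are made up purely for illustration:

from xiubai.items import XiubaiItem

item = XiubaiItem()
item['name'] = 'some_user'   # hypothetical value
item['up'] = '1024'          # hypothetical value
print(dict(item))            # -> {'name': 'some_user', 'up': '1024'}
# item['foo'] = 1            # would raise KeyError: 'foo' is not a declared field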
Set the User-Agent (settings.py)
USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
Set default request headers (settings.py)
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    # The Cookie and If-None-Match values below were copied from a live browser
    # session; they are session-specific, so replace or remove them for your own runs.
    'Cookie': '_xsrf=2|a0eca77a|7d13a299dd3eb490a26e81d725dcde83|1538874558; _ga=GA1.2.942515558.1538874563; _gid=GA1.2.1645358480.1538874563; _qqq_uuid_="2|1:0|10:1538880579|10:_qqq_uuid_|56:ZWJmYTE3NjJmYjJjNDI0YTA4N2JiMDRlN2RhZjJhYzg5MzA4OTFmOQ==|318ae98093edcdc4a31802fcb4b593a868e3b4b35f5617a30b1a72a69bcfb8e4"; _gat=1',
    'Host': 'www.qiushibaike.com',
    'If-None-Match': '9c4491a96c80d5392bfec0e6fc2a1004da84ff5a',
    'Referer': 'https://www.qiushibaike.com/text/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
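While editing settings.py, two more settings are worth a look. This is a minimal sketch and my own addition, not from the original post: the site's robots.txt may block generic crawlers, and a small delay keeps the crawl polite.

# Scrapy honours robots.txt by default; disable only for a learning exercise.
ROBOTSTXT_OBEY = False
# Wait a second between requests so we don't hammer the site.
DOWNLOAD_DELAY = 1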
Parsing the data (spiders/xiubai_spider.py)
# -*- coding: utf-8 -*-
import re

import scrapy

from xiubai.items import XiubaiItem


class XiubaiSpiderSpider(scrapy.Spider):
    # spider name, used with `scrapy crawl xiubai_spider`
    name = 'xiubai_spider'
    # allowed domains
    allowed_domains = ['www.qiushibaike.com']
    # entry URL
    start_urls = ['https://www.qiushibaike.com/text/']
    # strips 'articleGender ' and 'Icon' from a class attribute such as
    # 'articleGender manIcon', leaving just 'man' or 'women'
    gender_strip_pattern = re.compile('articleGender |Icon')

    def parse(self, response):
        article_list = response.xpath("//div[@class='col1']/div")
        for article_item in article_list:
            xiubai_item = XiubaiItem()
            author_div = article_item.xpath("./div[@class='author clearfix']")
            # anonymous posts have no <a> wrapping the author info; skip them
            has_author = author_div.xpath("./a").extract()
            if has_author:
                xiubai_item['avatar'] = "https:" + author_div.xpath("./a/img/@src").extract_first()
                xiubai_item['profile_link'] = "https://www.qiushibaike.com" + author_div.xpath("./a/@href").extract_first()
                xiubai_item['name'] = author_div.xpath("./a/h2/text()").extract_first()
                gender_text = author_div.xpath(".//div[contains(@class,'articleGender')]/@class").extract_first()
                xiubai_item['gender'] = self.gender_strip_pattern.sub('', gender_text)
                xiubai_item['age'] = author_div.xpath("./div/text()").extract_first()
                xiubai_item['content'] = article_item.xpath(".//div[@class='content']/span/text()").extract_first()
                # note: 'contentHerf' is the (misspelled) class name actually used on the site
                xiubai_item['content_link'] = "https://www.qiushibaike.com" + article_item.xpath("./a[@class='contentHerf']/@href").extract_first()
                xiubai_item['up'] = article_item.xpath(".//span[@class='stats-vote']/i/text()").extract_first()
                xiubai_item['comment_num'] = article_item.xpath(".//span[@class='stats-comments']//i/text()").extract_first()
                yield xiubai_item

        # follow the "next page" link, feeding it back into this same parser
        next_link = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract()
        if next_link:
            yield scrapy.Request("https://www.qiushibaike.com" + next_link[0], callback=self.parse)
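With the parser in place, a quick way to smoke-test it is to run the spider from the project root and dump items to a file (the output filename here is arbitrary):

scrapy crawl xiubai_spider -o xiubai.json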
Storing the data in MySQL
settings.py
ITEM_PIPELINES = {
    'xiubai.pipelines.XiubaiPipeline': 300,
}

# MySQL connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'qiubai'   # database name; change to match your setup
MYSQL_USER = 'root'       # database user; change to match your setup
MYSQL_PASSWD = ''         # database password; change to match your setup
MYSQL_PORT = 3306         # database port
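The pipeline below reads from and writes to an articles table, but the original post never shows its schema. A plausible sketch, inferred from the nine item fields (the column types are my assumptions):

CREATE TABLE articles (
    id           INT AUTO_INCREMENT PRIMARY KEY,
    avatar       VARCHAR(255),
    profile_link VARCHAR(255),
    name         VARCHAR(64),
    gender       VARCHAR(16),
    age          VARCHAR(8),
    content      TEXT,
    content_link VARCHAR(255),
    up           VARCHAR(16),
    comment_num  VARCHAR(16)
) DEFAULT CHARSET = utf8mb4;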
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

from xiubai.items import XiubaiItem
from xiubai.settings import (MYSQL_DBNAME, MYSQL_HOST, MYSQL_PASSWD,
                             MYSQL_PORT, MYSQL_USER)


class XiubaiPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host=MYSQL_HOST,
            db=MYSQL_DBNAME,
            port=MYSQL_PORT,
            user=MYSQL_USER,
            passwd=MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True,
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        if isinstance(item, XiubaiItem):
            try:
                # upsert keyed on content_link: update the row if this joke
                # was stored before, otherwise insert a new one
                sql = "select * from articles where content_link = %s"
                self.cursor.execute(sql, (item['content_link'],))
                if self.cursor.fetchone():
                    sql = ("update articles set avatar = %s, profile_link = %s, "
                           "name = %s, gender = %s, age = %s, content = %s, "
                           "content_link = %s, up = %s, comment_num = %s "
                           "where content_link = %s")
                    params = (item['avatar'], item['profile_link'], item['name'],
                              item['gender'], item['age'], item['content'],
                              item['content_link'], item['up'], item['comment_num'],
                              item['content_link'])
                else:
                    sql = ("insert into articles(avatar, profile_link, name, gender, "
                           "age, content, content_link, up, comment_num) "
                           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                    params = (item['avatar'], item['profile_link'], item['name'],
                              item['gender'], item['age'], item['content'],
                              item['content_link'], item['up'], item['comment_num'])
                self.cursor.execute(sql, params)
                self.connect.commit()
            except Exception as error:
                print(error)
        return item

    def close_spider(self, spider):
        # release the database connection when the spider finishes
        self.connect.close()
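Importing constants from xiubai.settings works, but the more idiomatic Scrapy pattern is the from_crawler hook, which reads the running crawler's settings and therefore also respects per-run overrides. A sketch of how the pipeline's constructor could be rewired (not part of the original code):

import pymysql


class XiubaiPipeline(object):
    def __init__(self, host, dbname, user, passwd, port):
        self.connect = pymysql.connect(host=host, db=dbname, port=port,
                                       user=user, passwd=passwd, charset='utf8')
        self.cursor = self.connect.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the running crawler; values set via
        # `scrapy crawl ... -s MYSQL_HOST=...` are picked up as well.
        s = crawler.settings
        return cls(s.get('MYSQL_HOST'), s.get('MYSQL_DBNAME'),
                   s.get('MYSQL_USER'), s.get('MYSQL_PASSWD'),
                   s.getint('MYSQL_PORT'))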
Rotating the User-Agent (middlewares.py)
import random


class RandomUserAgentMiddleware(object):
    # pool of User-Agent strings to rotate through
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'xiubai.middlewares.XiubaiDownloaderMiddleware': 543,
    'xiubai.middlewares.RandomUserAgentMiddleware': 543,
}
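Scrapy's built-in UserAgentMiddleware sits at priority 500 in the default downloader-middleware stack, so at 543 our process_request runs after it and the randomly chosen User-Agent replaces the one set from the USER_AGENT setting.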
Thanks to:
https://www.imooc.com/learn/1017
https://blog.csdn.net/c315838651/article/details/72675470
https://blog.csdn.net/yancey_blog/article/details/53895821