Scrapy Crawler Project

This post walks through scraping text jokes from Qiushibaike: https://www.qiushibaike.com/text/

Create the project

scrapy startproject xiubai
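startproject generates the standard Scrapy layout; everything edited below lives inside the inner xiubai/ package:

xiubai/
    scrapy.cfg            # deploy configuration
    xiubai/
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider and downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider modules live here
            __init__.py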

Generate the spider

cd xiubai
scrapy genspider xiubai_spider www.qiushibaike.com
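genspider creates spiders/xiubai_spider.py from the default template, roughly as below (the exact skeleton varies slightly across Scrapy versions). The doubled "Spider" in the class name comes from genspider appending "Spider" to the camel-cased spider name; the parse callback gets filled in later in this post.

# -*- coding: utf-8 -*-
import scrapy


class XiubaiSpiderSpider(scrapy.Spider):
    name = 'xiubai_spider'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['http://www.qiushibaike.com/']

    def parse(self, response):
        pass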

Define what to scrape (items.py)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class XiubaiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # avatar image URL
    avatar = scrapy.Field()
    # author profile page link
    profile_link = scrapy.Field()
    # author nickname
    name = scrapy.Field()
    # gender
    gender = scrapy.Field()
    # age
    age = scrapy.Field()
    # joke text
    content = scrapy.Field()
    # joke detail page link
    content_link = scrapy.Field()
    # upvote count
    up = scrapy.Field()
    # comment count
    comment_num = scrapy.Field()
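Scrapy items support dict-style access, which is how the spider below fills them in. A quick sanity check in a Python shell (the values here are made up):

from xiubai.items import XiubaiItem

item = XiubaiItem()
item['name'] = 'some author'    # fields are assigned like dict keys
print(item['name'])             # -> some author
print(dict(item))               # an Item converts cleanly to a plain dict

Assigning to a field that was not declared raises KeyError, which catches typos early.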

Set the User-Agent (settings.py)

USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
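Two related settings are worth checking while editing settings.py: the startproject template enables robots.txt compliance, which may block pages on this site, and a small download delay reduces the chance of being rate-limited. Both lines below are optional suggestions rather than part of the original setup:

ROBOTSTXT_OBEY = False    # the project template sets this to True
DOWNLOAD_DELAY = 1        # seconds between requests; lowers the ban risk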

Set the default request headers (settings.py)

Note that the Cookie and If-None-Match values below were captured from a live browser session and are session-specific: replace them with values from your own session, or remove those two headers entirely (a stale If-None-Match can even trigger empty 304 responses).

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cookie': '_xsrf=2|a0eca77a|7d13a299dd3eb490a26e81d725dcde83|1538874558; _ga=GA1.2.942515558.1538874563; _gid=GA1.2.1645358480.1538874563; _qqq_uuid_="2|1:0|10:1538880579|10:_qqq_uuid_|56:ZWJmYTE3NjJmYjJjNDI0YTA4N2JiMDRlN2RhZjJhYzg5MzA4OTFmOQ==|318ae98093edcdc4a31802fcb4b593a868e3b4b35f5617a30b1a72a69bcfb8e4"; _gat=1',
    'Host': 'www.qiushibaike.com',
    'If-None-Match': "9c4491a96c80d5392bfec0e6fc2a1004da84ff5a",
    'Referer': 'https://www.qiushibaike.com/text/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

Parse the data (spiders/xiubai_spider.py)

# -*- coding: utf-8 -*-
import scrapy
import re
from xiubai.items import XiubaiItem


class XiubaiSpiderSpider(scrapy.Spider):
    # spider name, used by `scrapy crawl`
    name = 'xiubai_spider'
    # domains the spider is allowed to follow
    allowed_domains = ['www.qiushibaike.com']
    # entry point URL
    start_urls = ['https://www.qiushibaike.com/text/']

    # the gender div's class looks like "articleGender womenIcon";
    # stripping the prefix and suffix leaves "women" / "man"
    gender_strip_pattern = re.compile('articleGender |Icon')

    def parse(self, response):
        article_list = response.xpath("//div[@class='col1']/div")
        for article_item in article_list:
            xiubai_item = XiubaiItem()
            author_div = article_item.xpath("./div[@class='author clearfix']")
            # anonymous posts have no <a> author block; skip the author fields for them
            if author_div.xpath("./a").extract():
                xiubai_item['avatar'] = "https:" + author_div.xpath("./a/img/@src").extract_first()
                xiubai_item['profile_link'] = "https://www.qiushibaike.com" + author_div.xpath("./a/@href").extract_first()
                xiubai_item['name'] = author_div.xpath("./a/h2/text()").extract_first()
                gender_text = author_div.xpath(".//div[contains(@class,'articleGender')]/@class").extract_first()
                xiubai_item['gender'] = self.gender_strip_pattern.sub('', gender_text)
                xiubai_item['age'] = author_div.xpath("./div/text()").extract_first()
            xiubai_item['content'] = article_item.xpath(".//div[@class='content']/span/text()").extract_first()
            xiubai_item['content_link'] = "https://www.qiushibaike.com" + article_item.xpath("./a[@class='contentHerf']/@href").extract_first()
            xiubai_item['up'] = article_item.xpath(".//span[@class='stats-vote']/i/text()").extract_first()
            xiubai_item['comment_num'] = article_item.xpath(".//span[@class='stats-comments']//i/text()").extract_first()
            yield xiubai_item
        # follow the "next page" link, if any
        next_link = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract_first()
        if next_link:
            yield scrapy.Request("https://www.qiushibaike.com" + next_link, callback=self.parse)
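With parsing in place, run the spider from the project root. The optional -o flag exports items to a file (the file name here is arbitrary), which is handy for verifying the XPaths before wiring up MySQL:

scrapy crawl xiubai_spider
scrapy crawl xiubai_spider -o check.json    # dump items to JSON for inspection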

Store the data in MySQL
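The pipeline uses the pymysql driver, which does not ship with Scrapy, so install it first:

pip install pymysql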

settings.py

ITEM_PIPELINES = {
   'xiubai.pipelines.XiubaiPipeline': 300,
}
# MySQL connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'qiubai'         # database name; change to match your setup
MYSQL_USER = 'root'             # database user; change to match your setup
MYSQL_PASSWD = ''               # database password; change to match your setup
MYSQL_PORT = 3306               # database port; read by the pipeline when connecting
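The pipeline assumes an articles table already exists in the qiubai database. A minimal schema matching the item fields might look like the sketch below; the column names must match the SQL in the pipeline, but the types and lengths are assumptions to adjust as needed:

CREATE TABLE articles (
    id INT AUTO_INCREMENT PRIMARY KEY,
    avatar VARCHAR(255),
    profile_link VARCHAR(255),
    name VARCHAR(64),
    gender VARCHAR(16),
    age VARCHAR(8),
    content TEXT,
    content_link VARCHAR(255),
    up VARCHAR(16),
    comment_num VARCHAR(16)
) DEFAULT CHARSET = utf8;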

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql

from xiubai.items import XiubaiItem
from xiubai.settings import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWD, MYSQL_PORT

# column order shared by the INSERT and UPDATE statements below
FIELDS = ('avatar', 'profile_link', 'name', 'gender', 'age',
          'content', 'content_link', 'up', 'comment_num')


class XiubaiPipeline(object):
    def __init__(self):
        # one connection for the lifetime of the pipeline
        self.connect = pymysql.connect(
            host=MYSQL_HOST,
            db=MYSQL_DBNAME,
            port=MYSQL_PORT,
            user=MYSQL_USER,
            passwd=MYSQL_PASSWD,
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        if isinstance(item, XiubaiItem):
            try:
                # content_link is the natural key: update the row if the
                # article was stored before, insert it otherwise
                self.cursor.execute(
                    "select * from articles where content_link = %s",
                    (item['content_link'],))
                # item.get() returns None for the author fields that are
                # missing on anonymous posts, instead of raising KeyError
                params = tuple(item.get(field) for field in FIELDS)
                if self.cursor.fetchone():
                    sql = ("update articles set avatar = %s, profile_link = %s, name = %s, "
                           "gender = %s, age = %s, content = %s, content_link = %s, "
                           "up = %s, comment_num = %s where content_link = %s")
                    self.cursor.execute(sql, params + (item['content_link'],))
                else:
                    sql = ("insert into articles(avatar, profile_link, name, gender, age, "
                           "content, content_link, up, comment_num) "
                           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                    self.cursor.execute(sql, params)
                self.connect.commit()
            except Exception as error:
                # roll back so a failed statement cannot poison the connection
                self.connect.rollback()
                spider.logger.error(error)
        return item

    def close_spider(self, spider):
        # release MySQL resources when the crawl ends
        self.cursor.close()
        self.connect.close()

Randomize the User-Agent with a downloader middleware (middlewares.py)

import random


class MyUserAgentMiddleware(object):
    """Downloader middleware that assigns a random User-Agent to each request."""

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def process_request(self, request, spider):
        # overwrite whatever User-Agent earlier middlewares set
        request.headers['User-Agent'] = random.choice(self.user_agent_list)

settings.py

DOWNLOADER_MIDDLEWARES = {
   # 'xiubai.middlewares.XiubaiDownloaderMiddleware': 543,
   'xiubai.middlewares.MyUserAgentMiddleware': 543,
}
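Priority 543 places this middleware after Scrapy's built-in UserAgentMiddleware (registered at 500 in DOWNLOADER_MIDDLEWARES_BASE), which applies the USER_AGENT setting; because the custom middleware runs later, the random header it sets is the one that actually goes out.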

Thanks to:
https://www.imooc.com/learn/1017
https://blog.csdn.net/c315838651/article/details/72675470
https://blog.csdn.net/yancey_blog/article/details/53895821
