20. Python Crawler: Scraping All of a User's Hexun Blog Posts with Scrapy and Saving Them to a Database

Project structure:
[Figure 1: screenshot of the project's directory layout]
MySQL table creation statement (the pipeline below connects to a database named spider, so create the table there):

CREATE TABLE `myhexun` (
  `id` int(9) NOT NULL AUTO_INCREMENT COMMENT 'article id',
  `name` varchar(60) DEFAULT NULL COMMENT 'article title',
  `url` varchar(100) DEFAULT NULL COMMENT 'article url',
  `hits` int(15) DEFAULT NULL COMMENT 'article hit count',
  `comment` int(15) DEFAULT NULL COMMENT 'article comment count',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1876 DEFAULT CHARSET=utf8;

1. pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class HexunpjtPipeline(object):
    def __init__(self):
        # charset="utf8" keeps Chinese article titles from being garbled on insert
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="spider", charset="utf8")

    def process_item(self, item, spider):
        # Each blog list page holds several posts, so loop over them one by one
        for j in range(0, len(item['name'])):
            # Unpack the name, url, hits, and comment values for post j
            name = item['name'][j]
            url = item['url'][j]
            hits = item['hits'][j]
            comment = item['comment'][j]
            print("name:%s,url:%s,hits:%s,comment:%s" % (name, url, hits, comment))
            # Build the SQL statement that inserts the scraped values into the table
            sql = "insert into myhexun(name,url,hits,comment) VALUES ('" + name + "','" + url + "','" + hits + "','" + comment + "')"
            print("sql statement: %s" % sql)
            # Execute the statement via conn.query() and commit the transaction
            self.conn.query(sql)
            self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection when the spider finishes
        self.conn.close()
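
Building the INSERT by string concatenation breaks as soon as a title contains a single quote, and it is open to SQL injection. A safer drop-in variant of process_item (a minimal sketch assuming the same table layout) passes the values as query parameters and lets pymysql do the escaping:

    def process_item(self, item, spider):
        # Parameterized insert: the driver escapes quotes inside titles for us
        with self.conn.cursor() as cursor:
            for name, url, hits, comment in zip(item['name'], item['url'],
                                                item['hits'], item['comment']):
                cursor.execute(
                    "insert into myhexun(name,url,hits,comment) VALUES (%s,%s,%s,%s)",
                    (name, url, hits, comment))
        self.conn.commit()
        return item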

2. settings.py

Uncomment the ITEM_PIPELINES setting so Scrapy routes every scraped item through the pipeline above (300 is the pipeline's order value; pipelines with lower values run first):

ITEM_PIPELINES = {
   'hexunpjt.pipelines.HexunpjtPipeline': 300,
}
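
If the spider finishes without fetching anything, robots.txt filtering is a likely cause, since newer Scrapy project templates enable it by default. Disabling it in settings.py (an optional tweak, not part of the original write-up) looks like:

ROBOTSTXT_OBEY = False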

3. items.py

# -*- coding: utf-8 -*-

import scrapy

class HexunpjtItem(scrapy.Item):
    # name holds the article title
    name = scrapy.Field()
    # url holds the article's address
    url = scrapy.Field()
    # hits holds the article's view count
    hits = scrapy.Field()
    # comment holds the article's comment count
    comment = scrapy.Field()
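
A scrapy.Item behaves like a dict whose keys are restricted to the declared fields. A standalone illustration (not part of the project files) of how the spider fills and reads one:

item = HexunpjtItem()
item['name'] = ['post one', 'post two']   # one list entry per post on the page
item['hits'] = ['120', '45']
print(item['name'][0], item['hits'][0])   # -> post one 120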

4. myhexunspd.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from hexunpjt.items import HexunpjtItem
import urllib.request
import re

# http://14755969.blog.hexun.com/
class MyhexunspdSpider(scrapy.Spider):
    name = 'myhexunspd'
    # uid of the blogger to crawl; used below to build the page URLs
    uid = "14755969"
    allowed_domains = ['hexun.com']
    start_urls = ['http://hexun.com/']
    def start_requests(self):
        # Disguise the first request as a browser via the User-Agent header
        yield Request("http://"+str(self.uid)+".blog.hexun.com/p1/default.html", headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36"})

    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
        item['url'] = response.xpath("//span[@class='ArticleTitleText']/a/@href").extract()
        # Next, use urllib and re to fetch each post's hit and comment counts.
        # pat1 extracts the URL of the script that serves those counts; this
        # endpoint and the regexes below are reconstructions of Hexun's markup,
        # so treat them as assumptions to verify against the live pages.
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
        hcurl = re.compile(pat1).findall(str(response.body))[0]
        # Fetch that URL, again disguised as a browser
        opener = urllib.request.build_opener()
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36")]
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(hcurl).read()
        # Hit counts appear as click<id>','<count>' and comments as comment<id>','<count>'
        item['hits'] = re.compile("click\d*?','(\d*?)'").findall(str(data))
        item['comment'] = re.compile("comment\d*?','(\d*?)'").findall(str(data))
        yield item
        # Read the total number of list pages from the pager links (.../p<n>/...)
        pagedata = re.compile("blog.hexun.com/p(.*?)/").findall(str(response.body))
        totalpage = pagedata[-2] if len(pagedata) >= 2 else 1
        # Crawl the remaining list pages, reusing this parse() as the callback
        for i in range(2, int(totalpage) + 1):
            nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(nexturl, callback=self.parse, headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36"})
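With all four files in place, run the spider from the project's root directory (the standard Scrapy invocation, not shown in the original write-up):

scrapy crawl myhexunspd

Afterwards the scraped rows can be inspected in MySQL with select * from myhexun;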
