CREATE TABLE `myhexun` (
  `id` int(9) NOT NULL AUTO_INCREMENT COMMENT 'article id',
  `name` varchar(60) DEFAULT NULL COMMENT 'article title',
  `url` varchar(100) DEFAULT NULL COMMENT 'article url',
  `hits` int(15) DEFAULT NULL COMMENT 'article hit count',
  `comment` int(15) DEFAULT NULL COMMENT 'article comment count',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1876 DEFAULT CHARSET=utf8;
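Before wiring up the pipeline, it can help to confirm that the table is reachable from Python. A minimal sketch, using the same connection settings as the pipeline below (adjust host, user and password to your own environment):

import pymysql

# Sanity check: connect with the credentials the pipeline will use
# and confirm the myhexun table exists and is queryable.
conn = pymysql.connect(host="127.0.0.1", user="root",
                       passwd="123456", db="spider", charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM myhexun")
        print("rows in myhexun:", cursor.fetchone()[0])
finally:
    conn.close()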
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class HexunpjtPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host="127.0.0.1", user="root",
                                    passwd="123456", db="spider",
                                    charset="utf8")

    def process_item(self, item, spider):
        # Each blog list page contains several posts, so loop over them
        # and handle each post's information in turn.
        for j in range(0, len(item['name'])):
            # Assign the extracted name, url, hits and comment values
            # to local variables.
            name = item['name'][j]
            url = item['url'][j]
            hits = item['hits'][j]
            comment = item['comment'][j]
            print("name:%s,url:%s,hits:%s,comment:%s" % (name, url, hits, comment))
            # Build a parameterized INSERT statement; letting the driver
            # escape the values avoids SQL injection and quoting bugs
            # (the original string concatenation breaks on titles
            # containing quotes).
            sql = "INSERT INTO myhexun(name,url,hits,comment) VALUES (%s,%s,%s,%s)"
            print("sql statement: %s" % sql)
            # Execute the statement through a cursor and commit the insert.
            with self.conn.cursor() as cursor:
                cursor.execute(sql, (name, url, hits, comment))
            self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection when the spider finishes.
        self.conn.close()
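To exercise the pipeline outside a full crawl, you can feed it a hand-built item. This is only a smoke-test sketch, assuming the MySQL instance above is running; the dict stands in for a real HexunpjtItem (which behaves like a dict), and the URL is made up:

from hexunpjt.pipelines import HexunpjtPipeline

# Hypothetical test data, not part of the project code.
fake_item = {
    'name': ['test post'],
    'url': ['http://14755969.blog.hexun.com/demo.html'],  # made-up URL
    'hits': ['10'],
    'comment': ['2'],
}
pipeline = HexunpjtPipeline()
pipeline.process_item(fake_item, spider=None)
pipeline.close_spider(spider=None)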
Next, in settings.py, uncomment the ITEM_PIPELINES setting so the pipeline is enabled:
ITEM_PIPELINES = {
    'hexunpjt.pipelines.HexunpjtPipeline': 300,
}
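The number 300 is the pipeline's order value: Scrapy runs enabled pipelines in ascending order of this value (conventionally 0-1000). If you later add a second pipeline, say a hypothetical validation step, a smaller number makes it run first:

# Hypothetical example: ValidateItemPipeline is a made-up name used
# only to illustrate how the order values interact.
ITEM_PIPELINES = {
    'hexunpjt.pipelines.ValidateItemPipeline': 200,  # runs first
    'hexunpjt.pipelines.HexunpjtPipeline': 300,      # runs second
}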
# -*- coding: utf-8 -*-
import scrapy


class HexunpjtItem(scrapy.Item):
    # name stores the article title
    name = scrapy.Field()
    # url stores the article URL
    url = scrapy.Field()
    # hits stores the article read count
    hits = scrapy.Field()
    # comment stores the article comment count
    comment = scrapy.Field()
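Scrapy items behave like dictionaries, so the spider can assign each field the whole list extracted from one list page, with the lists pairing up by index. A small illustration (the values here are made up):

from hexunpjt.items import HexunpjtItem

item = HexunpjtItem()
item['name'] = ['post one', 'post two']  # titles from one list page
item['hits'] = ['120', '45']             # parallel list of hit counts
print(item['name'][0], item['hits'][0])  # fields pair up by index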
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from hexunpjt.items import HexunpjtItem
import urllib.request
import re


# http://14755969.blog.hexun.com/
class MyhexunspdSpider(scrapy.Spider):
    name = 'myhexunspd'
    # uid of the user to crawl, used below to build the crawl URLs
    uid = "14755969"
    allowed_domains = ['hexun.com']
    start_urls = ['http://hexun.com/']

    def start_requests(self):
        # Disguise the first request as a browser via the User-Agent header
        yield Request("http://" + str(self.uid) + ".blog.hexun.com/p1/default.html",
                      headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36"})

    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
        item['url'] = response.xpath("//span[@class='ArticleTitleText']/a/@href").extract()
        # Next, use the urllib and re modules to fetch each post's comment
        # and read counts. First, the regular expression that extracts the
        # URL of the script holding the comment and hit counts:
        pat1 = '