Scrapy is an application framework written for crawling websites and extracting structured data. It can be used in a wide range of programs, from data mining and information processing to archiving historical data.
1. Installing Scrapy
pip install scrapy
2. Scrapy documentation (Chinese translation)
http://scrapy-chs.readthedocs.org/zh_CN/0.24/intro/overview.html
3. How a Scrapy project runs
The rough steps are as follows:
1. Create a new project:
scrapy startproject Spider
2. Decide what to scrape:
Edit the Item class to declare the fields you want to extract:
class Information(scrapy.Item):
    title = scrapy.Field()
    body = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    time = scrapy.Field()
3. Write the spider:
from scrapy.spider import Spider

class DmozSpider(Spider):
    name = "demo"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        filename = response.url.split("/")[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)
4. Run the spider:
scrapy crawl demo
5. Export the scraped content (an equivalent settings-based approach is sketched below):
scrapy crawl demo -o items.json -t json
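If you don't want to pass -o and -t on every run, the same export can be configured through Scrapy's feed-export settings. A minimal sketch, assuming the Spider project created in step 1; FEED_URI and FEED_FORMAT are standard feed-export settings and items.json is just a placeholder path:

# Spider/settings.py -- equivalent of "-o items.json -t json"
FEED_URI = 'items.json'     # where the exported items are written
FEED_FORMAT = 'json'        # serialization format of the feed

With these in place, a plain "scrapy crawl demo" produces the same items.json.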
4. A little spider with big dreams
1. Project layout
C:.
│ items.json
│ scrapy.cfg
│
└─Spider
│ items.py
│ items.pyc
│ pipelines.py
│ pipelines.pyc
│ settings.py
│ settings.pyc
│ __init__.py
│ __init__.pyc
│
└─spiders
Information_spider.py
Information_spider.pyc
__init__.py
__init__.pyc
2. The spider, Information_spider.py, which does the actual scraping
# -*- coding:utf-8 -*-
from scrapy.spider import Spider
from scrapy.http import Request
# from scrapy.contrib.spiders import Rule  # only needed if the Rule below is uncommented
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from Spider.items import Information
from scrapy import log
from bs4 import BeautifulSoup
import datetime


class Information_Spider(Spider):
    name = "csdn"
    allowed_domains = ["csdn.net"]
    # Search keywords
    categories = ["python", u"测试"]
    start_urls = [
        "http://so.csdn.net/so/search/s.do?q=" + "".join(categories[0]) + "&t=blog",
        "http://so.csdn.net/so/search/s.do?q=" + "".join(categories[1]) + "&t=blog"
    ]
    rules = [
        # Rule(SgmlLinkExtractor(allow=('')), callback='parse_article', follow=True)
    ]

    # Collect the "next page" links of the popular-blog search results
    def parse(self, response):
        base_url = "http://so.csdn.net/so/search/s.do"
        soup = BeautifulSoup(response.body, 'html.parser')
        links = soup.find("span", "page-nav").find_all("a")
        print u"**Collecting next-page links of popular blogs**\n"
        for link in links:
            href = base_url + link.get("href")
            # Hand each collected link to parse_link for the next round of scraping
            yield Request(href, callback=self.parse_link)

    # Collect the links to the popular blog posts
    def parse_link(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        links = soup.find_all("dl", "search-list")
        print u"**Collecting popular blog links**\n"
        print links
        for link in links:
            href = link.find("dt").find("a").get("href")
            # Hand each collected link to parse_article for the next round of scraping
            yield Request(href, callback=self.parse_article)

    # Extract the article itself
    def parse_article(self, response):
        items = []
        soup = BeautifulSoup(response.body, 'html.parser')
        base_url = "http://blog.csdn.net"
        # Record the scrape date
        time = datetime.datetime.today().strftime('%Y-%m-%d')
        # Article title
        title_block = soup.find("span", "link_title").find("a")
        title = title_block.get_text().encode("utf-8")
        # Article link
        title_link_detail = title_block.get("href")
        title_link = base_url + title_link_detail
        # Article author
        author_block = soup.find("div", {"id": "blog_userface"}).find("span").find("a")
        author = author_block.get_text()
        # Article body
        body_div = soup.find("div", "markdown_views")
        if body_div is None:
            body_div = soup.find("div", "article_content")
        body_block = body_div.find_all("p")
        article = ""
        for body in body_block:
            article += body.get_text().encode("utf-8") + "\n"
        # Store the scraped content
        if len(article) != 0:
            item = Information()
            item["title"] = title
            item["body"] = article
            item["author"] = author
            item["source"] = title_link
            item["time"] = time
            items.append(item)
        return items
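With the project laid out as in the directory tree above, this spider is started by its own name (csdn) rather than the demo name from the earlier example, for instance:

scrapy crawl csdn -o items.json -t json

which is presumably how the items.json file at the root of the directory tree was produced.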
3. items.py, which defines the data to scrape
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class SpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class Information(scrapy.Item):
    title = scrapy.Field()
    body = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    time = scrapy.Field()
4. pipelines.py, which stores the scraped data in MySQL
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb


class SpiderPipeline(object):

    # Connect to the database at 120.24.239.214
    def __init__(self):
        self.conn = MySQLdb.connect(
            host="120.24.239.214",
            user="root",
            passwd="***********",  # the password stays secret
            db="Teman",
            port=3306)
        self.cur = self.conn.cursor()

    # Store the scraped items in MySQL
    def process_item(self, item, spider):
        try:
            information_title = item["title"].strip()
            information_body = item["body"].replace("\n", "<br/>")
            information_author = item["author"]
            information_source = item["source"]
            information_time = item["time"]
            # Skip articles that have already been stored
            sql_select_source = "select * from information where source = \"" + "".join(str(information_source)) + "\""
            self.cur.execute(sql_select_source)
            judge_source = self.cur.fetchall()
            sql_select_title = "select * from information where title = \"" + "".join(str(information_title)) + "\""
            self.cur.execute(sql_select_title)
            judge_title = self.cur.fetchall()
            if len(judge_source) == 0 or len(judge_title) == 0:
                sql = "insert into information(title, body, author, source, time) values(\"" + "".join(str(information_title))\
                      + "\",\"" + "".join(str(information_body)) + "\",\"" + "".join(str(information_author)) + "\",\"" + \
                      "".join(str(information_source)) + "\",\"" + "".join(str(information_time)) + "\")"
                self.cur.execute(sql)
                sql = ""
                self.conn.commit()
        except MySQLdb.Error, e:
            print e
        return item

    # Close the MySQL connection
    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
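As the comment at the top of pipelines.py reminds us, this pipeline only runs if it is registered in ITEM_PIPELINES. A minimal sketch for this project's Spider/settings.py; the value 300 is just an arbitrary priority:

# Spider/settings.py
ITEM_PIPELINES = {
    'Spider.pipelines.SpiderPipeline': 300,
}

One design note: building the SQL by string concatenation breaks as soon as a title or body contains a double quote; passing the values as a parameter tuple to self.cur.execute() lets MySQLdb handle the escaping instead.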
5. Summary
A little spider with big dreams. I hope you take crawlers like this further; you already know how useful they can be.
QQ group (阳台测试): 239547991
My blog: http://xuyangting.sinaapp.com/