Python + Scrapy: A Little Crawler with Big Dreams

Scrapy is an application framework written for crawling websites and extracting structured data. It can be used in a wide range of programs, such as data mining, information processing, and archiving historical data.

1. Installing Scrapy

pip install scrapy
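
As a quick sanity check that the install worked, the version can be printed from Python (a minimal sketch):

import scrapy
print scrapy.__version__   # prints the installed Scrapy version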

2. Scrapy documentation (Chinese)

http://scrapy-chs.readthedocs.org/zh_CN/0.24/intro/overview.html

3. Scrapy workflow

The rough steps are as follows:
1. Create a new project:
scrapy startproject Spider
2. Decide what to scrape:
Edit the Item class to define the data you want to extract
class Information(scrapy.Item):
    title = scrapy.Field() 
    body = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    time = scrapy.Field()
3. Write the spider:
from scrapy.spider import Spider  

class DmozSpider(Spider):  
    name = "demo"  
    allowed_domains = ["dmoz.org"]  
    start_urls = [  
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",  
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"  
    ]  

    def parse(self, response):  
        filename = response.url.split("/")[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)

4. Run the spider:
scrapy crawl demo
5. Store the scraped content (a sketch for reading the exported file back follows these steps):
scrapy crawl demo -o items.json -t json
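
Once a spider actually yields items (the demo spider above only saves raw pages to disk), the exported feed can be inspected with a few lines of Python. A minimal sketch, assuming the output was written to items.json:

import json

# Read back the feed produced by "scrapy crawl demo -o items.json -t json"
with open("items.json") as f:
    items = json.load(f)

print "scraped %d items" % len(items)
if items:
    print items[0]   # each entry holds the fields declared on the Item class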

4. A Little Crawler with Big Dreams

1. Directory structure
C:.
│  items.json
│  scrapy.cfg
│
└─Spider
    │  items.py
    │  items.pyc
    │  pipelines.py
    │  pipelines.pyc
    │  settings.py
    │  settings.pyc
    │  __init__.py
    │  __init__.pyc
    │
    └─spiders
            Information_spider.py
            Information_spider.pyc
            __init__.py
            __init__.pyc

2. The spider file Information_spider.py, which does the actual scraping
# -*- coding:utf-8 -*-
from scrapy.spider import Spider
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from Spider.items import Information
from scrapy import log
from bs4 import BeautifulSoup
import datetime


class Information_Spider(Spider):
    name = "csdn"
    allowed_domains = ["csdn.net"]
    # Search keywords
    categories = ["python", u"测试"]
    start_urls = [
        "http://so.csdn.net/so/search/s.do?q=" + "".join(categories[0]) + "&t=blog",
        "http://so.csdn.net/so/search/s.do?q=" + "".join(categories[1]) + "&t=blog"
    ]
    rules = [
        # Rule(SgmlLinkExtractor(allow=('')), callback='parse_article', follow=True)
    ]

    # Collect next-page links from the popular-blog search results
    def parse(self, response):
        base_url = "http://so.csdn.net/so/search/s.do"
        soup = BeautifulSoup(response.body, 'html.parser')
        links = soup.find("span", "page-nav").find_all("a")
        print u"**获取热门博客下一页链接**\n"
        for link in links:
            href = base_url + link.get("href")
            # Hand each collected link to parse_link for the next round of crawling
            yield Request(href, callback=self.parse_link)

    # Collect links to the popular blog posts
    def parse_link(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        links = soup.find_all("dl", "search-list")
        print u"**获取热门博客链接**\n"
        print links
        for link in links:
            href = link.find("dt").find("a").get("href")
            # Hand each collected link to parse_article for the next round of crawling
            yield Request(href, callback=self.parse_article)

    # Extract the article
    def parse_article(self, response):
        items = []
        soup = BeautifulSoup(response.body, 'html.parser')
        base_url = "http://blog.csdn.net"

        # Record the date the article was scraped
        time = datetime.datetime.today().strftime('%Y-%m-%d')

        # Extract the article title
        title_block = soup.find("span", "link_title").find("a")
        title = title_block.get_text().encode("utf-8")

        # Extract the article link
        title_link_detail = title_block.get("href")
        title_link = base_url + title_link_detail

        # Extract the article author
        author_block = soup.find("div", {"id": "blog_userface"}).find("span").find("a")
        author = author_block.get_text()

        # Extract the article body
        body_div = soup.find("div", "markdown_views")
        if body_div is None:
            body_div = soup.find("div", "article_content")
        body_block = body_div.find_all("p")
        article = ""
        for body in body_block:
            article += body.get_text().encode("utf-8") + "\n"

        # Store the scraped content in an item
        if len(article) != 0:
            item = Information()
            item["title"] = title
            item["body"] = article
            item["author"] = author
            item["source"] = title_link
            item["time"] = time
            items.append(item)
            return items   
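
With the spider saved under spiders/, it is started by name, just like the demo spider above; storage is handled by the pipeline described below:

scrapy crawl csdn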

3. The items.py file, which defines the data to be scraped
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class Information(scrapy.Item):
    title = scrapy.Field()
    body = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    time = scrapy.Field()
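
As a quick illustration (a minimal sketch, not part of the project files), Item objects behave much like dictionaries restricted to the declared fields:

item = Information()
item["title"] = "example title"
item["source"] = "http://blog.csdn.net/example"
print item["title"]    # fields are read and written like dict entries
# item["tags"] = []    # would raise KeyError: only declared Fields are allowed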

4. The pipelines.py file, which stores the scraped data in MySQL
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb


class SpiderPipeline(object):
    # Connect to the MySQL database at 120.24.239.214
    def __init__(self):
        self.conn = MySQLdb.connect(
            host="120.24.239.214",
            user="root",
            passwd="***********",   #密码还是需要保密
            db="Teman",
            port=3306)
        self.cur = self.conn.cursor()

    # Store the scraped item data in MySQL
    def process_item(self, item, spider):
        try:
            information_title = item["title"].strip()
            information_body = item["body"].replace("\n", "<br/>")
            information_author = item["author"]
            information_source = item["source"]
            information_time = item["time"]
            # Skip articles that have already been stored
            self.cur.execute("select * from information where source = %s", (information_source,))
            judge_source = self.cur.fetchall()
            self.cur.execute("select * from information where title = %s", (information_title,))
            judge_title = self.cur.fetchall()
            if len(judge_source) == 0 or len(judge_title) == 0:
                sql = "insert into information(title, body, author, source, time) values(%s, %s, %s, %s, %s)"
                self.cur.execute(sql, (information_title, information_body, information_author,
                                       information_source, information_time))
            self.conn.commit()
        except MySQLdb.Error, e:
            print e
        return item

    # Close the MySQL connection
    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
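
For this pipeline to run, it has to be enabled in the project's settings.py, and the database needs a matching table. A minimal sketch, assuming the project is named Spider as above and that the table is called information with the five columns used in process_item:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'Spider.pipelines.SpiderPipeline': 300,
}

# Assumed table layout on the MySQL side, roughly:
#   create table information (
#       title varchar(255), body text, author varchar(64),
#       source varchar(255), time varchar(32)
#   );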

5. Summary

A little crawler with big dreams. I hope everyone keeps building on their crawlers; you know what they can be used for.

阳台测试 (QQ group): 239547991

My blog: http://xuyangting.sinaapp.com/
