SCRAPY爬虫实例

一:Scrapy简介

Scrapy是一个基于Twisted的异步爬虫框架,支持并发爬取数据(并非多线程),使用简单,爬取效率高。

二:项目介绍

1:爬取对象
#乐彩网历史双色球开奖号码,网址如下
http://www.17500.cn/ssq/awardlist.php
2:实现流程
(1):创建项目
mkdir scrapyApp
cd scrapyApp
/usr/rain/python/bin/scrapy startproject caipiaoSSQ
cd caipiaoSSQ
/usr/rain/python/bin/scrapy genspider ssq 17500.cn
ll
[root@rainsty caipiaoSSQ]# ll
total 12
drwxr-xr-x 4 root root 4096 Jan 14 10:04 caipiaoSSQ
-rw-r--r-- 1 root root  263 Jan  7 19:32 scrapy.cfg
-rwxr-xr-x 1 root root  420 Jan 14 09:42 start.sh
#目录树
[root@rainsty caipiaoSSQ]# tree
.
├── caipiaoSSQ
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   ├── items.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-36.pyc
│       │   └── ssq.cpython-36.pyc
│       └── ssq.py
├── scrapy.cfg
└── start.sh

4 directories, 15 files
[root@rainsty caipiaoSSQ]#
(2):构建item
#items.py
import scrapy

class CaipiaossqItem(scrapy.Item):
    """Scrapy item carrying one scraped record in a single ``name`` field."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # The ssq spider assigns a dict (column title -> cell text) to this field.
    name = scrapy.Field()
(3):编写爬虫ssq
#ssq.py
import scrapy
import re
from caipiaoSSQ.items import CaipiaossqItem


class SsqSpider(scrapy.Spider):
    """Spider that walks the 17500.cn SSQ award list page by page.

    Each table row is emitted as one ``CaipiaossqItem`` whose ``name`` field
    is a dict mapping the column titles in ``file_name`` to the cell text.
    """
    name = 'ssq'
    allowed_domains = ['17500.cn']
    start_urls = ['http://www.17500.cn/ssq/awardlist.php?p=1']

    # Follow-up page URLs that have already been requested. Kept at class
    # level so the check in parse() sees URLs recorded by earlier responses.
    url_set = set()

    def parse(self, response):
        """Yield one item per complete table row, then request the next page.

        :param response: the downloaded award-list page.
        :returns: generator of ``CaipiaossqItem`` objects, followed by at most
            one ``scrapy.Request`` for the page linked as "下一页".
        """
        file_name = ['期号', '日期', '周', '开奖号码', '本期投注', '返奖比', '奖池金额', '一等奖', '奖金一',
             '二等奖', '奖金二', '三等奖', '奖金三', '四等奖', '奖金四', '五等奖', '奖金五', '六等奖', '奖金六']
        columns = len(file_name)

        file_value = []
        for table in response.css('table.sortable'):
            for cell in table.xpath('./tbody/tr/td').extract():
                # extract() returns the cell's HTML; drop every tag so only
                # the text content is stored.
                file_value.append(re.sub(r'<[^>]+>', '', cell).strip())

        # Cells arrive as one flat list; cut it into rows of `columns` cells.
        # A trailing incomplete row is ignored.
        for start in range(0, len(file_value) - columns + 1, columns):
            # A fresh item per row: re-yielding one mutated object would let
            # later rows overwrite data that is still being processed.
            item = CaipiaossqItem()
            item['name'] = dict(zip(file_name, file_value[start:start + columns]))
            yield item

        # Find the anchor labelled "next page" inside the pager element.
        next_link = ''
        for anchor in response.xpath('//div[contains(@id,"sortlist")]/a').extract():
            if '下一页' in anchor:
                next_link = anchor
                break

        # Only a numeric page parameter counts; no match means no next page.
        match = re.search(r'\bp=(\d+)', next_link)
        if match is None:
            self.logger.info('end...')
            return

        url = 'http://www.17500.cn/ssq/awardlist.php?p=%d' % int(match.group(1))
        if url in SsqSpider.url_set:
            return
        SsqSpider.url_set.add(url)
        self.logger.debug(url)
        # scrapy.Request replaces Spider.make_requests_from_url, which is
        # deprecated and absent from newer Scrapy releases.
        yield scrapy.Request(url, callback=self.parse)
(4):编写数据存储pipelines
#pipelines.py
import json

class CaipiaossqPipeline(object):
    """Item pipeline that appends each item to ./my_ssq.txt as one JSON line."""

    def process_item(self, item, spider):
        """Write ``item['name']`` as a JSON line and hand the item on.

        :param item: scraped item; its ``name`` value must be JSON-serialisable.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``, so that later pipeline stages receive it.
        """
        # Explicit UTF-8: ensure_ascii=False writes non-ASCII text verbatim,
        # which raises UnicodeEncodeError under a non-UTF-8 default locale.
        with open("./my_ssq.txt", 'a', encoding='utf-8') as fp:
            fp.write(json.dumps(item['name'], ensure_ascii=False) + '\n')
        # Returning None would pass None to every subsequent pipeline stage.
        return item
(5):设置setting
#settings.py
 # Registers CaipiaossqPipeline so scraped items are passed to it; the
 # integer is the value Scrapy uses to order the enabled pipelines.
 # NOTE(review): the leading space looks like a leftover from uncommenting
 # the template lines; in the real settings.py this assignment presumably
 # has to start at column 0 - confirm.
 ITEM_PIPELINES = {
     'caipiaoSSQ.pipelines.CaipiaossqPipeline': 300,
 }
(6):启动爬虫
[root@rainsty caipiaoSSQ]# ll
total 12
drwxr-xr-x 4 root root 4096 Jan 14 10:49 caipiaoSSQ
-rw-r--r-- 1 root root  263 Jan  7 19:32 scrapy.cfg
-rwxr-xr-x 1 root root  420 Jan 14 09:42 start.sh
[root@rainsty caipiaoSSQ]# ./start.sh 
[root@rainsty caipiaoSSQ]# ll
total 1300
drwxr-xr-x 4 root root    4096 Jan 14 10:49 caipiaoSSQ
-rw-r--r-- 1 root root 1034521 Jan 14 10:49 my_ssq.json
-rw-r--r-- 1 root root  281880 Jan 14 10:49 nohup.out
-rw-r--r-- 1 root root     263 Jan  7 19:32 scrapy.cfg
-rwxr-xr-x 1 root root     420 Jan 14 09:42 start.sh

----nohup.out:日志输出记录
----my_ssq.json:爬取结果
----start.sh:启动文件,如下:

#!/bin/bash
#filename: start.sh
#createdtime: 2019-01-08

#mkdir scrapyApp
#cd scrapyApp
#/usr/rain/python/bin/scrapy startproject caipiaoSSQ
#cd caipiaoSSQ
#/usr/rain/python/bin/scrapy genspider ssq 17500.cn


#######################################
# Run the "ssq" spider, then publish its output file as my_ssq.json.
# Globals:   SCRAPY_BIN (read, optional) - scrapy executable to run;
#            defaults to /usr/rain/python/bin/scrapy
# Arguments: none
# Outputs:   nohup.out (crawl stdout/stderr) and my_ssq.json, both in the
#            current directory
# Returns:   the crawl's exit status, or the status of the rename if it fails
#######################################
function Main(){
    local scrapy_bin="${SCRAPY_BIN:-/usr/rain/python/bin/scrapy}"
    local name="./my_ssq.txt"
    local crawl_status=0

    # Do not stop on a failed crawl: whatever was written is still published,
    # but the status is kept so the caller can see the failure.
    "$scrapy_bin" crawl ssq > nohup.out 2>&1 || crawl_status=$?

    if [[ -e "$name" ]]; then
        # One rename instead of copy-then-delete.
        mv -f -- "$name" my_ssq.json || return
    fi

    return "$crawl_status"
}

Main
(7):结果展示
{"期号": "2019006", "日期": "2019-01-13", "周": "日", "开奖号码": "01 05 10 19 26 28+12", "本期投注": "375,825,296", "返奖比": "79.5%", "奖池金额": "1,211,353,738", "一等奖": "32", "奖金一": "5,514,887", "二等奖":
 "96", "奖金二": "214,536", "三等奖": "1820", "奖金三": "3,000", "四等奖": "83388", "奖金四": "200", "五等奖": "1500047", "奖金五": "10", "六等奖": "12926873", "奖金六": "5"}
{"期号": "2019005", "日期": "2019-01-10", "周": "四", "开奖号码": "21 22 26 28 31 32+07", "本期投注": "343,365,078", "返奖比": "55%", "奖池金额": "1,326,043,660", "一等奖": "11", "奖金一": "6,133,650", "二等奖": "
68", "奖金二": "229,230", "三等奖": "1308", "奖金三": "3,000", "四等奖": "61547", "奖金四": "200", "五等奖": "1102950", "奖金五": "10", "六等奖": "15727044", "奖金六": "5"}
{"期号": "2019004", "日期": "2019-01-08", "周": "二", "开奖号码": "08 12 16 19 26 32+03", "本期投注": "338,080,388", "返奖比": "45%", "奖池金额": "1,346,750,738", "一等奖": "8", "奖金一": "7,428,584", "二等奖": "2
22", "奖金二": "109,395", "三等奖": "1308", "奖金三": "3,000", "四等奖": "69480", "奖金四": "200", "五等奖": "1385574", "奖金五": "10", "六等奖": "7368055", "奖金六": "5"}
{"期号": "2019003", "日期": "2019-01-06", "周": "日", "开奖号码": "13 17 20 21 22 27+01", "本期投注": "373,728,002", "返奖比": "38.6%", "奖池金额": "1,333,321,882", "一等奖": "5", "奖金一": "9,635,283", "二等奖": 
"129", "奖金二": "224,577", "三等奖": "920", "奖金三": "3,000", "四等奖": "59266", "奖金四": "200", "五等奖": "1278481", "奖金五": "10", "六等奖": "7969323", "奖金六": "5"}
{"期号": "2019002", "日期": "2019-01-03", "周": "四", "开奖号码": "04 05 06 08 09 18+11", "本期投注": "345,144,344", "返奖比": "45.7%", "奖池金额": "1,294,586,730", "一等奖": "9", "奖金一": "7,273,230", "二等奖": 
"129", "奖金二": "198,246", "三等奖": "1352", "奖金三": "3,000", "四等奖": "74631", "奖金四": "200", "五等奖": "1506728", "奖金五": "10", "六等奖": "6555174", "奖金六": "5"}
{"期号": "2019001", "日期": "2019-01-01", "周": "二", "开奖号码": "06 10 13 15 32 33+15", "本期投注": "341,453,440", "返奖比": "50.8%", "奖池金额": "1,283,324,272", "一等奖": "14", "奖金一": "6,657,038", "二等奖":
 "147", "奖金二": "197,266", "三等奖": "822", "奖金三": "3,000", "四等奖": "47264", "奖金四": "200", "五等奖": "1059312", "奖金五": "10", "六等奖": "5761518", "奖金六": "5"}
{"期号": "2018153", "日期": "2018-12-30", "周": "日", "开奖号码": "01 07 17 23 25 31+11", "本期投注": "363,663,410", "返奖比": "37.5%", "奖池金额": "1,289,528,301", "一等奖": "4", "奖金一": "10,000,000", "二等奖":
 "133", "奖金二": "205,313", "三等奖": "1259", "奖金三": "3,000", "四等奖": "58445", "奖金四": "200", "五等奖": "1096798", "奖金五": "10", "六等奖": "8506891", "奖金六": "5"}
{"期号": "2018152", "日期": "2018-12-27", "周": "四", "开奖号码": "04 14 16 23 28 29+03", "本期投注": "342,935,808", "返奖比": "70.5%", "奖池金额": "1,247,608,325", "一等奖": "20", "奖金一": "5,479,653", "二等奖":
 "168", "奖金二": "71,376", "三等奖": "5101", "奖金三": "3,000", "四等奖": "155083", "奖金四": "200", "五等奖": "1893432", "奖金五": "10", "六等奖": "10963861", "奖金六": "5"}
{"期号": "2018151", "日期": "2018-12-25", "周": "二", "开奖号码": "05 15 19 25 26 29+15", "本期投注": "345,353,380", "返奖比": "39.8%", "奖池金额": "1,321,227,399", "一等奖": "4", "奖金一": "9,708,849", "二等奖": 
"203", "奖金二": "115,981", "三等奖": "2250", "奖金三": "3,000", "四等奖": "87941", "奖金四": "200", "五等奖": "1559540", "奖金五": "10", "六等奖": "7022515", "奖金六": "5"}

三:源码地址分享

源码地址:Github:[https://github.com/Rainstyed/rainsty/tree/master/Spiders/ScrapyApp/caipiaoSSQ]

你可能感兴趣的:(python,spider)