Scrapy是一个基于Twisted的异步爬虫框架,支持并发爬取数据(依靠异步I/O而非多线程),使用简单,爬取效率高。
#乐彩网历史双色球开奖号码,网址如下
http://www.17500.cn/ssq/awardlist.php
mkdir scrapyApp
cd scrapyApp
/usr/rain/python/bin/scrapy startproject caipiaoSSQ
cd caipiaoSSQ
/usr/rain/python/bin/scrapy genspider ssq 17500.cn
ll
[root@rainsty caipiaoSSQ]# ll
total 12
drwxr-xr-x 4 root root 4096 Jan 14 10:04 caipiaoSSQ
-rw-r--r-- 1 root root 263 Jan 7 19:32 scrapy.cfg
-rwxr-xr-x 1 root root 420 Jan 14 09:42 start.sh
#目录树
[root@rainsty caipiaoSSQ]# tree
.
├── caipiaoSSQ
│ ├── __init__.py
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── items.cpython-36.pyc
│ │ ├── pipelines.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── ssq.cpython-36.pyc
│ └── ssq.py
├── scrapy.cfg
└── start.sh
4 directories, 15 files
[root@rainsty caipiaoSSQ]#
#items.py
import scrapy
class CaipiaossqItem(scrapy.Item):
    """Item carrying one scraped row of the award list.

    The spider stores a single dict in ``name`` that maps each column
    heading to the cell text of that row.
    """

    # Dict of column heading -> cell text for one table row.
    name = scrapy.Field()
#ssq.py
import scrapy
import re
from caipiaoSSQ.items import CaipiaossqItem
class SsqSpider(scrapy.Spider):
    """Spider that walks the paginated award list on 17500.cn.

    Every table row is emitted as one ``CaipiaossqItem`` whose ``name``
    field is a dict mapping column heading to cell text. After a page is
    processed, the "next page" link is followed until no new page is found.
    """

    name = 'ssq'
    allowed_domains = ['17500.cn']
    start_urls = ['http://www.17500.cn/ssq/awardlist.php?p=1']
    # Listing URLs that have already been scheduled (shared class-level set).
    url_set = set()

    def parse(self, response):
        """Yield one item per table row, then a request for the next page.

        Args:
            response: the downloaded award-list page.

        Yields:
            ``CaipiaossqItem`` objects, followed by at most one
            ``scrapy.Request`` for the next listing page.
        """
        allSSQ = response.css('table.sortable')
        file_name = ['期号', '日期', '周', '开奖号码', '本期投注', '返奖比', '奖池金额', '一等奖', '奖金一',
                     '二等奖', '奖金二', '三等奖', '奖金三', '四等奖', '奖金四', '五等奖', '奖金五', '六等奖', '奖金六']
        # One row is exactly as many cells as there are headings.
        row_width = len(file_name)

        file_value = []
        for t in allSSQ:
            for td in t.xpath('./tbody/tr/td'):
                # NOTE(review): the tag-stripping pattern shown in the original
                # ('| ||') only ever matches the empty string, so it removed
                # nothing. Joining the text nodes yields the cell content
                # without markup -- confirm the result against the live page.
                file_value.append(''.join(td.xpath('.//text()').extract()).strip())

        # Cut the flat cell list into complete rows; a trailing partial row is
        # ignored. A fresh item is built per row so that items already handed
        # to the pipelines are never mutated afterwards.
        for start in range(0, len(file_value) - row_width + 1, row_width):
            item = CaipiaossqItem()
            item['name'] = dict(zip(file_name, file_value[start:start + row_width]))
            yield item

        # Locate the anchor whose markup contains the "next page" label.
        page_num = ''
        for p in response.xpath('//div[contains(@id,"sortlist")]/a').extract():
            if '下一页' in p:
                page_num = p
                break

        # Take only the digits after "p=" so extra attributes or query
        # parameters in the anchor cannot break the integer conversion.
        match = re.search(r'\bp=(\d+)', page_num)
        if match is None:
            self.logger.info('end...')
            return

        url = 'http://www.17500.cn/ssq/awardlist.php?p=%d' % int(match.group(1))
        self.logger.info(url)
        if url not in SsqSpider.url_set:
            SsqSpider.url_set.add(url)
            # Spider.make_requests_from_url() is deprecated/removed in newer
            # Scrapy releases; build the request directly. dont_filter=True
            # mirrors what that helper did -- url_set does the de-duplication.
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
#pipelines.py
import json
class CaipiaossqPipeline(object):
    """Append every item's ``name`` dict to ./my_ssq.txt as one JSON line."""

    def process_item(self, item, spider):
        """Write ``item['name']`` as a JSON line and pass the item on.

        Args:
            item: scraped item; ``item['name']`` must be JSON-serialisable.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged item, so that later pipeline stages still receive it.
        """
        # ensure_ascii=False emits raw non-ASCII text, so the encoding is
        # pinned to UTF-8 instead of depending on the process locale.
        with open("./my_ssq.txt", 'a', encoding='utf-8') as fp:
            fp.write(json.dumps(item['name'], ensure_ascii=False) + '\n')
        return item
#settings.py
# Enable the project's item pipeline; 300 is the value assigned to it here.
ITEM_PIPELINES = {'caipiaoSSQ.pipelines.CaipiaossqPipeline': 300}
[root@rainsty caipiaoSSQ]# ll
total 12
drwxr-xr-x 4 root root 4096 Jan 14 10:49 caipiaoSSQ
-rw-r--r-- 1 root root 263 Jan 7 19:32 scrapy.cfg
-rwxr-xr-x 1 root root 420 Jan 14 09:42 start.sh
[root@rainsty caipiaoSSQ]# ./start.sh
[root@rainsty caipiaoSSQ]# ll
total 1300
drwxr-xr-x 4 root root 4096 Jan 14 10:49 caipiaoSSQ
-rw-r--r-- 1 root root 1034521 Jan 14 10:49 my_ssq.json
-rw-r--r-- 1 root root 281880 Jan 14 10:49 nohup.out
-rw-r--r-- 1 root root 263 Jan 7 19:32 scrapy.cfg
-rwxr-xr-x 1 root root 420 Jan 14 09:42 start.sh
----nohup.out:日志输出记录
----my_ssq.json:爬取结果
----start.sh:启动文件,如下:
#!/bin/bash
#filename: start.sh
#createdtime: 2019-01-08
#mkdir scrapyApp
#cd scrapyApp
#/usr/rain/python/bin/scrapy startproject caipiaoSSQ
#cd caipiaoSSQ
#/usr/rain/python/bin/scrapy genspider ssq 17500.cn
#######################################
# Run the "ssq" spider and publish its output as my_ssq.json.
# Globals:   SCRAPY_BIN (read, optional) - scrapy executable to use;
#            defaults to /usr/rain/python/bin/scrapy
# Arguments: none
# Outputs:   nohup.out   - stdout and stderr of the crawl
#            my_ssq.json - crawl result, renamed from my_ssq.txt if present
# Returns:   exit status of the crawl, or of mv if the rename fails
#######################################
Main() {
  local scrapy_bin="${SCRAPY_BIN:-/usr/rain/python/bin/scrapy}"
  local raw_file="./my_ssq.txt"
  local out_file="./my_ssq.json"
  local status=0

  # Remember the crawl status instead of discarding it; whatever was written
  # is still published below, exactly as before.
  "$scrapy_bin" crawl ssq > nohup.out 2>&1 || status=$?

  if [[ -e "$raw_file" ]]; then
    # A single rename replaces the former cat + rm -rf pair.
    mv -f -- "$raw_file" "$out_file" || return
  fi

  return "$status"
}
Main
{"期号": "2019006", "日期": "2019-01-13", "周": "日", "开奖号码": "01 05 10 19 26 28+12", "本期投注": "375,825,296", "返奖比": "79.5%", "奖池金额": "1,211,353,738", "一等奖": "32", "奖金一": "5,514,887", "二等奖": "96", "奖金二": "214,536", "三等奖": "1820", "奖金三": "3,000", "四等奖": "83388", "奖金四": "200", "五等奖": "1500047", "奖金五": "10", "六等奖": "12926873", "奖金六": "5"}
{"期号": "2019005", "日期": "2019-01-10", "周": "四", "开奖号码": "21 22 26 28 31 32+07", "本期投注": "343,365,078", "返奖比": "55%", "奖池金额": "1,326,043,660", "一等奖": "11", "奖金一": "6,133,650", "二等奖": "68", "奖金二": "229,230", "三等奖": "1308", "奖金三": "3,000", "四等奖": "61547", "奖金四": "200", "五等奖": "1102950", "奖金五": "10", "六等奖": "15727044", "奖金六": "5"}
{"期号": "2019004", "日期": "2019-01-08", "周": "二", "开奖号码": "08 12 16 19 26 32+03", "本期投注": "338,080,388", "返奖比": "45%", "奖池金额": "1,346,750,738", "一等奖": "8", "奖金一": "7,428,584", "二等奖": "222", "奖金二": "109,395", "三等奖": "1308", "奖金三": "3,000", "四等奖": "69480", "奖金四": "200", "五等奖": "1385574", "奖金五": "10", "六等奖": "7368055", "奖金六": "5"}
{"期号": "2019003", "日期": "2019-01-06", "周": "日", "开奖号码": "13 17 20 21 22 27+01", "本期投注": "373,728,002", "返奖比": "38.6%", "奖池金额": "1,333,321,882", "一等奖": "5", "奖金一": "9,635,283", "二等奖": "129", "奖金二": "224,577", "三等奖": "920", "奖金三": "3,000", "四等奖": "59266", "奖金四": "200", "五等奖": "1278481", "奖金五": "10", "六等奖": "7969323", "奖金六": "5"}
{"期号": "2019002", "日期": "2019-01-03", "周": "四", "开奖号码": "04 05 06 08 09 18+11", "本期投注": "345,144,344", "返奖比": "45.7%", "奖池金额": "1,294,586,730", "一等奖": "9", "奖金一": "7,273,230", "二等奖": "129", "奖金二": "198,246", "三等奖": "1352", "奖金三": "3,000", "四等奖": "74631", "奖金四": "200", "五等奖": "1506728", "奖金五": "10", "六等奖": "6555174", "奖金六": "5"}
{"期号": "2019001", "日期": "2019-01-01", "周": "二", "开奖号码": "06 10 13 15 32 33+15", "本期投注": "341,453,440", "返奖比": "50.8%", "奖池金额": "1,283,324,272", "一等奖": "14", "奖金一": "6,657,038", "二等奖": "147", "奖金二": "197,266", "三等奖": "822", "奖金三": "3,000", "四等奖": "47264", "奖金四": "200", "五等奖": "1059312", "奖金五": "10", "六等奖": "5761518", "奖金六": "5"}
{"期号": "2018153", "日期": "2018-12-30", "周": "日", "开奖号码": "01 07 17 23 25 31+11", "本期投注": "363,663,410", "返奖比": "37.5%", "奖池金额": "1,289,528,301", "一等奖": "4", "奖金一": "10,000,000", "二等奖": "133", "奖金二": "205,313", "三等奖": "1259", "奖金三": "3,000", "四等奖": "58445", "奖金四": "200", "五等奖": "1096798", "奖金五": "10", "六等奖": "8506891", "奖金六": "5"}
{"期号": "2018152", "日期": "2018-12-27", "周": "四", "开奖号码": "04 14 16 23 28 29+03", "本期投注": "342,935,808", "返奖比": "70.5%", "奖池金额": "1,247,608,325", "一等奖": "20", "奖金一": "5,479,653", "二等奖": "168", "奖金二": "71,376", "三等奖": "5101", "奖金三": "3,000", "四等奖": "155083", "奖金四": "200", "五等奖": "1893432", "奖金五": "10", "六等奖": "10963861", "奖金六": "5"}
{"期号": "2018151", "日期": "2018-12-25", "周": "二", "开奖号码": "05 15 19 25 26 29+15", "本期投注": "345,353,380", "返奖比": "39.8%", "奖池金额": "1,321,227,399", "一等奖": "4", "奖金一": "9,708,849", "二等奖": "203", "奖金二": "115,981", "三等奖": "2250", "奖金三": "3,000", "四等奖": "87941", "奖金四": "200", "五等奖": "1559540", "奖金五": "10", "六等奖": "7022515", "奖金六": "5"}
源码地址:Github:[https://github.com/Rainstyed/rainsty/tree/master/Spiders/ScrapyApp/caipiaoSSQ]