Python web scraping -- scraping lottery draw data

I've recently become interested in the lottery, so I wanted to scrape some lottery draw data. I'm not very fluent with the scraping-related libraries and am mostly learning as I go; the code I ended up with is below:

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Fetch a page and return its HTML text
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    print('No data!')
    return None
# Parse the draw rows for one day into a DataFrame
def getCDf(c, strDate):
    qishu = []   # draw number within the day
    num1 = []
    num2 = []
    num3 = []
    num4 = []
    num5 = []
    for z in c:
        # the second <span> of each row holds the five drawn digits
        balls = z.select('span')[1].find_all('span', 'ds_ib mr5')
        num1.append(balls[0].text)
        num2.append(balls[1].text)
        num3.append(balls[2].text)
        num4.append(balls[3].text)
        num5.append(balls[4].text)
        # the first <span> holds the draw number
        qishu.append(int(z.select('span')[0].text))

    dict1 = {
        "qishu": qishu,
        "num1": num1,
        "num2": num2,
        "num3": num3,
        "num4": num4,
        "num5": num5,
    }
    df = pd.DataFrame(dict1)
    df['date'] = strDate
    return df

def getDayDf(url):
    # the URL ends with 'YYYY-MM-DD.html', so slice the date out of it
    strDate = url[-15:-5]
    html = get_html(url)
    if html is None:
        return pd.DataFrame()
    soup = BeautifulSoup(html, 'lxml')
    # the draw table alternates white and grey rows; collect both sets
    a = soup.find_all('div', 'ov bg_faf9f5 bs_bb')
    c1 = a[0].find_all('div', 'ov h34 bg_white')
    c2 = a[0].find_all('div', 'ov h34 bg_grayf2')
    c = c1 + c2
    df = getCDf(c, strDate)
    df = df.sort_values('qishu')
    # combined key: date plus draw number, e.g. 2019-01-01-1
    df['dd'] = df['date'] + '-' + df['qishu'].astype(str)
    return df
# Scrape the draws for a single day
ndate = '2019-01-01'
url = 'https://www.caibow.com/kj/cqssc/{0}.html'.format(ndate)
df = getDayDf(url)
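
getDayDf returns one day's draws, so the natural next step is to loop over a range of dates, stack the daily frames together, and save the result. Below is a minimal sketch of how I would do that; the date range, the output filename, and the one-second pause between requests are just illustrative choices, and writing an .xlsx file needs openpyxl installed.

# Sketch: collect a week of draws into one DataFrame and save it
import time
import pandas as pd

dates = pd.date_range('2019-01-01', '2019-01-07').strftime('%Y-%m-%d')
frames = []
for d in dates:
    day_url = 'https://www.caibow.com/kj/cqssc/{0}.html'.format(d)
    frames.append(getDayDf(day_url))
    time.sleep(1)  # pause between requests to go easy on the site

all_df = pd.concat(frames, ignore_index=True)
all_df.to_excel('cqssc_draws.xlsx', index=False)  # needs openpyxl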

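As a quick sanity check on the scraped data, the digit frequencies can be counted and plotted. The sketch below is only an illustration: it assumes df is the frame returned by getDayDf above and that the num columns hold single-digit strings, which is what the page appears to provide.

# Sketch: how often each digit was drawn in the first position for one day
from matplotlib import pyplot as plt

counts = df['num1'].value_counts().sort_index()
plt.bar(counts.index, counts.values)
plt.xlabel('digit in position num1')
plt.ylabel('times drawn')
plt.title('num1 digit frequency on ' + ndate)
plt.show()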