#This post batch-scrapes the basketball match results and prize data published on 竞彩网: http://info.sporttery.cn/basketball/match_result.php
#The data we need are the detailed fixed-odds payouts shown on each match's result page.
#Collecting them by hand is tedious, repetitive work, so I wrote a crawler to do it instead.
#If you don't yet know how to drive a browser programmatically, see my earlier post first: http://blog.csdn.net/trisyp/article/details/78688106
#Once the environment is configured as described there, the code in this post will run as-is.
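Before running the full script, you can sanity-check the Selenium + chromedriver setup with a minimal sketch (the chromedriver path is the same one assumed in the script below; substitute your own):

from selenium import webdriver

driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver")  # path to your chromedriver
driver.get("http://info.sporttery.cn/basketball/match_result.php")
print(driver.title)  # a Chrome window opening and a title printing means the setup works
driver.quit()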
The full code is as follows:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import os
import time
def startDriver():  # launch a Chrome browser
    # build the simulated browser
    chromedriver = "C:/Program Files (x86)/Google/Chrome/Application/chromedriver"  # path to the driver
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)  # open the simulated browser
    time.sleep(2)
    return driver
def getDatenumber(url):  # collect each match's date and number from the list page
    r = requests.get(url)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')  # parse the page
    tag = soup.find_all('table', attrs={"class": {"m-tab"}})
    tag1 = tag[0].find_all('tr')
    dateNumber = []
    for i in range(len(tag1)):
        tag2 = tag1[i].find_all('td')
        try:
            info = tag2[0].text + tag2[1].text  # date cell + match-number cell
        except IndexError:  # stop at the first row that has no <td> cells
            break
        dateNumber.append(info)
    del dateNumber[-1]  # the last collected row is not a match record, drop it
    return dateNumber
def getHTML(driver, url, xpath):  # open the page in the browser and return the newest window's HTML
    driver.get(url)  # open the list page
    time.sleep(2)
    # simulate clicking the match's fixed-odds result link, which opens a new window
    driver.find_element_by_xpath(xpath).click()
    time.sleep(3)
    driver.switch_to.window(driver.window_handles[-1])  # switch to the newly opened window
    html = driver.page_source
    return html
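# Note: every click in getHTML opens a new window that is never closed, so
# windows accumulate while a list page is being scraped. An optional cleanup
# sketch (assuming only the newest window is ever needed, which is what
# getHTML relies on):
# for handle in driver.window_handles[:-1]:
#     driver.switch_to.window(handle)
#     driver.close()
# driver.switch_to.window(driver.window_handles[-1])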
def getTableName(html):  # collect the title of every table on the result page
    soup = BeautifulSoup(html, 'html.parser')  # parse the page
    tag = soup.find_all('div', attrs={'class': {'kj-tit'}})
    tableName = []
    for infoi in tag:
        tableName.append(infoi.text.replace("\n", "").replace(" ", ""))
    return tableName
def fillUnivlist(driver, url):  # save the two middle tables of every match's result page
    dateNumbers = getDatenumber(url)
    result = []
    count = 0
    for k in range(len(dateNumbers)):
        # absolute XPath of the k-th row's result link on the list page
        xpath = "/html/body/div[4]/div[4]/table/tbody/tr[" + str(k + 1) + "]/td[13]/a"
        html = getHTML(driver, url, xpath)  # fetch the result page's HTML
        tableNames = getTableName(html)  # table titles
        soup = BeautifulSoup(html, 'html.parser')  # parse the page
        tag = soup.find_all('table', attrs={'class': {'kj-table'}})  # all tables on the page
        # print(str(tag[0]))
        for i in range(1, 3):  # tables 2 and 3 hold the detailed fixed-odds data
            infoTag = tag[i]
            contentTr = infoTag.find_all('tr')
            for j in range(len(contentTr)):
                if j == 0:  # header row: read the <th> cells
                    contentTh = contentTr[j].find_all('th')
                    info1 = dateNumbers[k] + "," + tableNames[i]
                    for infok in contentTh:
                        info1 = info1 + "," + infok.text.replace(" ", "")
                else:  # data rows: read the <td> cells
                    contentTd = contentTr[j].find_all('td')
                    info1 = dateNumbers[k] + "," + tableNames[i]
                    for infok in contentTd:
                        info1 = info1 + "," + infok.text
                result.append(info1)
        count += 1
        print("\rProgress on this page: {:.2f}%".format(count * 100 / len(dateNumbers)), end="")
    return result
def writeUnivlist(result, fpath, num):  # append the scraped rows to a text file
    with open(fpath, 'a', encoding='utf-8') as f:  # append mode; the with-block closes the file
        for i in range(num):
            f.write(result[i] + '\n')
def main():
    for i in range(9):  # the chosen date range spans 9 list pages
        driver = startDriver()
        url = "http://info.sporttery.cn/basketball/match_result.php?page=" + str(i + 1) + "&start_date=2017-11-05&end_date=2017-12-05"  # list page to visit
        result = fillUnivlist(driver, url)
        output_file = 'D:/page' + str(i + 1) + '.txt'
        writeUnivlist(result, output_file, len(result))
        driver.quit()  # quit() closes every window the clicks opened, not just the current one
        time.sleep(2)
        print("\nPage " + str(i + 1) + " finished!")

if __name__ == '__main__':
    main()
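Each line written by writeUnivlist is a plain comma-separated record of the form <date+match number>,<table name>,<cell values...>. A quick way to eyeball a finished page file (the path matches the one used in main(); adjust as needed):

with open('D:/page1.txt', encoding='utf-8') as f:
    for line in f:
        print(line.rstrip().split(','))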
#For how to obtain an element's XPath, see the last line of my previous post: http://blog.csdn.net/trisyp/article/details/78712715.
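In short: in Chrome DevTools, right-click the element's node and choose Copy → Copy XPath. If the absolute path used in fillUnivlist ever breaks because the page layout shifts, a relative XPath keyed to the list table's class is a more robust alternative (a sketch, assuming the table keeps the m-tab class used in getDatenumber and the link stays in the 13th column):

xpath = "//table[contains(@class,'m-tab')]//tr[" + str(k + 1) + "]/td[13]/a"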