这次依然选用天气预报为例子(数据多)
但换成了国内的,API似乎有点简单
解析网页 以及 正则表达式 那里卡了很久
在看源码的过程中,我发现 网页广告 是如何插入网页的,其实就是一堆链接。。。。。
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.select import Select
weather_list = []  # accumulates one [date, weather, temperature, wind direction, wind level] row per forecast day
def get_url(city_code='101270101'):
    """Build the weather.com.cn 7-day forecast URL.

    Args:
        city_code: weather.com.cn city code; defaults to '101270101'
            (the city used throughout this script), so existing callers
            are unaffected.

    Returns:
        The forecast page URL for the given city code.
    """
    # 1-day page variant: www.weather.com.cn/weather1d/<code>.shtml#input
    return 'http://www.weather.com.cn/weather/{0}.shtml'.format(city_code)
def get_urlText(url):
    """Fetch *url* and return the decoded HTML text, or None on failure.

    Sends a browser-like User-Agent header (the site may reject the
    default requests UA) and prints the original 'error 1' marker on
    any network/HTTP failure.
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        # timeout added: the original call could block indefinitely
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
        # apparent_encoding is guessed from the body itself, which is
        # more accurate than the (often missing) header charset
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # was a bare except: catch only requests' own errors so real
        # bugs (NameError, KeyboardInterrupt, ...) still surface
        print('error 1')
        return None
def get_parseText(parse_url):
    """Parse the forecast HTML and append one row per day to weather_list.

    Each appended row is [date, weather, temperature, wind direction,
    wind level]. Prints 'error 2' on failure (e.g. parse_url is None
    because the download failed, or the page layout changed).
    """
    try:
        soup = BeautifulSoup(parse_url, 'html.parser')
        # <ul class="t clearfix"> holds one <li> per forecast day
        days = soup.find('ul', 't clearfix').find_all('li')
        for day in days:
            date = day.find('h1').get_text()
            weather = day.find('p', 'wea').get_text()
            temperature = day.find('p', 'tem').find('i').get_text()
            # Wind directions live in the title="..." attributes of the
            # <em> element; lookbehind/lookahead capture just the values.
            # Raw string used so the regex needs no backslash escaping.
            win = re.findall(r'(?<= title=").*?(?=")',
                             str(day.find('p', 'win').find('em')))
            wind = '-'.join(win)
            wind_lev = day.find('p', 'win').find('i').get_text()
            # append() mutates the module-level list in place, so the
            # original `global weather_list` declaration was unnecessary
            weather_list.append([date, weather, temperature, wind, wind_lev])
    except (TypeError, AttributeError):
        # was a bare except; these two are what a None page or a changed
        # layout (find() returning None) actually raise
        print('error 2')
        return
def prints(weather_list):
    """Write the forecast rows to the text file 'weatherlist'.

    One header line plus one line per row; the five columns (date,
    weather, temperature, wind direction, wind level) are each centered
    in a width-10 field and separated by tabs.
    """
    tplt = '{0:^10}\t{1:^10}\t{2:^10}\t{3:^10}\t{4:^10}'  # center-align each column
    # 'with' guarantees the file is closed even if a write raises
    # (the original open()/close() pair had no try/finally).
    # NOTE: the original passed an extra chr(12288) argument to format();
    # str.format ignores unreferenced positional args, so dropping it
    # leaves the output byte-identical.
    with open('weatherlist', 'w', encoding='utf8') as f:
        f.write(tplt.format('日期', '天气', '温度', '风向', '风级'))
        f.write('\n')
        for row in weather_list:
            f.write(tplt.format(row[0], row[1], row[2], row[3], row[4]))
            f.write('\n')
def main():
    """Fetch the forecast page, parse it, and save the result to disk."""
    url = get_url()
    parse_text = get_urlText(url)
    get_parseText(parse_text)
    prints(weather_list)


# Guard so importing this module no longer triggers network I/O;
# running it as a script behaves exactly as before.
if __name__ == '__main__':
    main()
以下为源码(不得不说有500多行,可能是我还不太熟练,找了很久才定位到每一天天气所在的位置),每一天的数据形如:
-
4日(今天)
多云转小雨
16/9℃
<3级
-
5日(明天)
小雨
15/9℃
<3级
-
6日(后天)
多云
15/9℃
<3级
-
7日(周六)
多云转小雨
17/11℃
<3级
-
8日(周日)
小雨
15/9℃
<3级
-
9日(周一)
多云
14/6℃
<3级
-
10日(周二)
多云转晴
15/3℃
<3级
用csv输入输出
将中国各个城市的代码写入文件中。
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.select import Select
weather_list = []  # accumulates one [date, weather, temperature, wind direction, wind level] row per forecast day
def get_url():
    """Return the weather.com.cn 7-day forecast URL for city 101270101."""
    # 1-day page variant: www.weather.com.cn/weather1d/101270101.shtml#input
    base = 'http://www.weather.com.cn/weather/'
    city_code = '101270101'
    return base + city_code + '.shtml'
def get_urlText(url):
    """Download *url* and return its decoded HTML, or None on failure.

    A browser-like User-Agent is sent because the site may reject the
    default requests UA; the original 'error 1' marker is printed on
    any network/HTTP failure.
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        # timeout added: the original request could block forever
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
        # guess the charset from the body; more reliable than headers
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # was a bare except: only swallow requests' own exceptions
        print('error 1')
        return None
def get_parseText(parse_url):
    """Parse the forecast HTML, appending one row per day to weather_list.

    Row layout: [date, weather, temperature, wind direction, wind level].
    Prints 'error 2' on failure (parse_url is None, or the page layout
    changed and a find() returned None).
    """
    try:
        soup = BeautifulSoup(parse_url, 'html.parser')
        # <ul class="t clearfix"> contains one <li> per forecast day
        days = soup.find('ul', 't clearfix').find_all('li')
        for day in days:
            date = day.find('h1').get_text()
            weather = day.find('p', 'wea').get_text()
            temperature = day.find('p', 'tem').find('i').get_text()
            # Wind directions are the title="..." attribute values of
            # the <em> element; raw string avoids backslash escaping.
            win = re.findall(r'(?<= title=").*?(?=")',
                             str(day.find('p', 'win').find('em')))
            wind = '-'.join(win)
            wind_lev = day.find('p', 'win').find('i').get_text()
            # in-place append needs no `global` declaration
            weather_list.append([date, weather, temperature, wind, wind_lev])
    except (TypeError, AttributeError):
        # was a bare except; these are what a None page / missing tag raise
        print('error 2')
        return
def prints(weather_list):
    """Write the forecast rows to 'weather.csv' with a Chinese header row.

    newline='' is required when passing a text file to csv.writer;
    without it the csv module's own '\r\n' row terminator is doubled
    on Windows, leaving a blank line after every record.
    """
    titles = ['日期', '天气', '温度', '风向', '风级']
    with open('weather.csv', 'w', encoding='utf8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(titles)
        f_csv.writerows(weather_list)
def main():
    """Fetch the forecast page, parse it, and write weather.csv."""
    url = get_url()
    parse_text = get_urlText(url)
    get_parseText(parse_text)
    prints(weather_list)


# Guard so importing this module does not trigger network I/O;
# behavior as a script is unchanged.
if __name__ == '__main__':
    main()