Link: https://pan.baidu.com/s/1b39J-dEfUt1ZROO93aEkag
Extraction code: 8848
1. Parsing is done mainly with BeautifulSoup; you need to be familiar with its find_all and find methods (look them up if you are not).
2. pandas and numpy are used for the data cleaning and mining, with plenty of practical techniques along the way.
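For readers new to BeautifulSoup, here is a minimal, self-contained sketch of find_all and find on a made-up HTML fragment. The tags and class names below are invented purely for illustration and have nothing to do with Qunar's real page structure:

from bs4 import BeautifulSoup

# A made-up HTML fragment, only for demonstrating the two methods.
demo_html = '''
<ul>
  <li class="item"><a href="/a">Note A</a><span class="views">1.2万</span></li>
  <li class="item"><a href="/b">Note B</a><span class="views">356</span></li>
</ul>
'''
soup = BeautifulSoup(demo_html, 'lxml')          # the 'lxml' parser needs the lxml package installed
for li in soup.find_all('li', class_='item'):    # find_all returns a list of every matching tag
    title = li.find('a').get_text()              # find returns only the first match (or None)
    views = li.find('span', class_='views').get_text()
    print(title, views)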
import requests
from bs4 import BeautifulSoup
import re
import time
import csv
import random
from lxml import etree
def get_html(i):
    # Fetch page i of the Qunar travel-note list, sorted by popularity.
    url = 'https://travel.qunar.com/travelbook/list.htm?page={}&order=hot_heat'.format(i)
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
               'cookie': '<paste fresh cookies from your browser here>',
               'referer': 'https://travel.qunar.com/?from=header'}
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    html_ = response.text
    # print(html_)
    # The parsing code is restricted from being published here; it is included in the source download above.
    return html_
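The author's parsing function get_info is not shown in the post (it is in the source download). Purely to illustrate the general shape such a function could take, here is a hypothetical sketch; the class names ('note-item', 'intro') and the single-h2-title assumption are placeholders, not Qunar's actual markup, so this is not a drop-in replacement for the real get_info:

def get_info(html_):
    # Hypothetical sketch only: every selector below is a placeholder, not Qunar's real markup.
    soup = BeautifulSoup(html_, 'lxml')
    rows = []
    for item in soup.find_all('li', class_='note-item'):          # placeholder list-item class
        title_tag = item.find('h2')
        comment = title_tag.get_text(strip=True) if title_tag else ''
        intro = item.find('p', class_='intro')                    # placeholder block holding the metadata
        fields = intro.get_text('|', strip=True).split('|') if intro else []
        # Pad/trim to the eight columns written to Travel.csv below:
        # 短评, 出发时间, 天数, 人均费用, 人物, 玩法, 浏览量, 途经
        row = [comment] + fields
        row += [''] * (8 - len(row))
        rows.append(row[:8])
    with open('D:/爬取的内容/去哪儿网站攻略爬取/Travel.csv', 'a+', encoding='utf-8-sig', newline='') as csvFile:
        csv.writer(csvFile).writerows(rows)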
Code to save the data:
with open('D:/爬取的内容/去哪儿网站攻略爬取/Travel.csv', 'a+', encoding='utf-8-sig', newline='') as csvFile:
    # Write the header row once; the with block closes the file automatically, so no explicit close() is needed.
    csv.writer(csvFile).writerow(['短评', '出发时间', '天数', '人均费用', '人物', '玩法', '浏览量', '途经'])
for i in range(1, 110):
    # print(i)
    html = get_html(i)
    get_info(html)
    time.sleep(random.randint(3, 5))   # pause 3-5 seconds between pages to avoid hammering the site
    print("第{}页爬取成功!".format(i))
print("爬取结束")
import pandas as pd
df = pd.read_csv("D:/爬取的内容/去哪儿网站攻略爬取/Travel.csv" , encoding = 'utf-8-sig')
df
A look at the output shows the data is quite messy, so we clean it up next:
1. Extract the departure date from 出发时间.
2. Extract the number of days as a plain number.
3. Fill missing values in 人均费用 (cost per person) with the column mean.
4. Fill missing values in 人物 (travel companions) with the mode.
5. Extract the number from 浏览量 (views); note that values containing 万 must be multiplied by 10,000 (a vectorized sketch of this conversion follows the list).
6. Split 途经 (route) into two fields: origin and destination.
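As a side note, the 万-to-number conversion in step 5 can also be done without an explicit Python loop. The snippet below is a small vectorized sketch using pandas string methods and numpy on a toy Series (it stands in for the 浏览量 column, not the author's exact code):

import numpy as np
import pandas as pd

views = pd.Series(['1.2万', '356', '3万'])                      # toy stand-in for df['浏览量']
num = views.str.extract(r'(\d+\.?\d*)')[0].astype(float)        # pull out the leading number
scaled = np.where(views.str.contains('万'), num * 10000, num)   # multiply by 10,000 where 万 appears
print(scaled.astype(int))                                       # [12000 356 30000]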
The code for each step follows:
df['新天数'] = '0'
df
import re
for index, row in df.iterrows():
    day = re.findall(r'\d+', str(row['天数']))[0]
    print(day)
    # row is a copy, so write back through df.at or the DataFrame will not be updated
    df.at[index, '新天数'] = day
df
df['新出发日期'] = df['出发时间'].str[:-2]
df
df['浏览量'].count()  # 1090, no missing values
df['新浏览量'] = '0'
for index, row in df.iterrows():
    if '万' in row['浏览量']:
        # e.g. '1.2万' -> 12000
        view = float(re.findall(r"\d+\.?\d*", str(row['浏览量']))[0]) * 10000
        print(view)
        df.at[index, '新浏览量'] = str(int(view))
    else:
        view = re.findall(r"\d+", str(row['浏览量']))[0]
        df.at[index, '新浏览量'] = view
df
df['起始地'] = '.'
df['目的地'] = '.'
df['途经'].count()  # 1090, no missing values
for index, row in df.iterrows():
    if '-' in row['途经']:
        # '-' marks a missing route; keep a '-' placeholder in both new fields
        df.at[index, '起始地'] = '-'
        df.at[index, '目的地'] = '-'
    else:
        s_next = row['途经'].split(':')[1]
        if '>' in s_next:
            li = s_next.split('>')
            df.at[index, '起始地'] = li[0]
            df.at[index, '目的地'] = li[-1]
        else:
            # only one place listed, so it is both origin and destination
            df.at[index, '起始地'] = s_next
            df.at[index, '目的地'] = s_next
df
df['新人物'] = '独自一人'
df
for index, row in df.iterrows():
    if '-' not in row['人物']:
        # '-' marks a missing value; everything else is kept as-is
        df.at[index, '新人物'] = row['人物']
df
import numpy as np
import re
df['新人均费用'] = '0'
df
for index, row in df.iterrows():
    if '-' not in row['人均费用']:
        a = float(re.findall(r"\d+", str(row['人均费用']))[0])
        df.at[index, '新人均费用'] = str(a)
df
df['新人均费用'] = df['新人均费用'].replace('0', np.nan)   # turn the '0' placeholders into NaN so they can be mean-filled
df
# Fill the NaNs with the column mean:
mean_val = round(df['新人均费用'].astype(float).mean(),2)
mean_val
df['新人均费用'] = df['新人均费用'].fillna(mean_val)
df
df.drop(labels=['出发时间', '人均费用', '人物', '浏览量', '天数', '途经'], axis=1, inplace=True)
df
df.to_csv('new_Travel.csv',encoding='utf-8-sig')
That wraps up this simple case study of collecting, cleaning, and mining travel data. I have plenty more cases built up from day-to-day practice and will keep writing them up and sharing them. If you found this useful, please give me a follow; your support is the biggest motivation for me to keep creating.