《Python3爬虫、数据清洗和可视化实战》: notes on the hard-to-follow parts and a summary of the key code (Chapter 10)

《Python3爬虫、数据清洗和可视化实战》

By 零一, 韩要宾, 黄园园


Chapter 10: A comprehensive application example

Example: recommending travel products to users by cost-performance ratio

Part 1: Data collection
import requests
import json
import urllib.request
import time
import csv
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys


def globalVals():
	global driver
	global driver_
	
	driver = webdriver.Chrome()
	driver_ = webdriver.Chrome()

def init_csv():
	global f
	global writer
	csvFile = "D:/qunar_routes.csv"
	# If the file looks garbled when opened, change utf-8 below to gb18030
	f = open(csvFile, "w",newline="", encoding='utf-8')
	writer = csv.writer(f)
	writer.writerow(["出发地", "目的地", "路线信息", "酒店信息"])

def close_csv():
	global f
	f.close()

def dump_routes_csv(dep,arr):
	global driver
	global driver_
	global writer

	# Locate all route entries on the result page
	routes = driver.find_elements_by_css_selector(".item.g-flexbox.list-item")
	for route in routes:
		try:

			print ("\nroute info:%s" % route.text)
			#获取路线详细页URL
			url = route.get_attribute ("data-ur1")
			print ("url:%s" % url)

			#在另一个浏览器对象打开路线详情页
			driver_.get(url)
			time.sleep(random.uniform(2, 3))

			if "fhtouch" in url: 	#机酒自由行
				try:
					# we have to wait for the page to refresh
							WebDriverWait(driver_,10).until(EC.presence_of_element_located((By .css_SELECTOR,”#allHotels”)))
					Source=diver_.find_element_css_selector(‘#main-page’)
					target=diver_.find_element_css_selector(‘#allHotels’)
				except:	
					print (str(e))
					continue
			else: 			#自由行
				try:
					#等待頁面刷新成功
					WebDriverWait(driver_,10).until(EC.presence_of_element_located((By .css_SELECTOR,”.m-ball.m-ball-back”)))
					Source=diver_.find_element_css_selector(‘.flex.scrollable’)
					target=diver_.find_element_css_selector(‘.m-ball.m-ball-back’)
				except:	
					print (str(e))
					continue

			#路线详情页需須通过drag_and_drop动作获得焦点,否则[rage Down]鍵无效			ActionChains(driver_).drag_and_drop(source, target).perform()

			for i in range(3):
				# Simulate pressing [Page Down] to scroll the page down (3 times)
				ActionChains(driver_).send_keys(Keys.PAGE_DOWN).perform()

			# The elements below can only be located after the detail page has been scrolled down
			try:
				# we have to wait for the page to refresh
				WebDriverWait(driver_, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".tit .score")))
			except Exception as e:
				print(str(e))
				continue

			try:
				# Get the hotel rating
				rating = driver_.find_element_by_css_selector(".tit .score")

				# Get the hotel class
				hotel_type = driver_.find_element_by_css_selector(".tit + .tag-list > .g-tag.solid")

				# Join rating and class into the hotel info string
				hotel = '\n'.join([rating.text, hotel_type.text])
				print ("hotel info:%s" % hotel)
			except Exception as e:
				print (str(e))
				continue

			# Write this route record to the CSV file
			writer.writerow([dep, arr, route.text, hotel])
		except:
			continue

if __name__ == "__main__":
	globalVals()
	init_csv()
	dep_cities = ["杭州"]

	for dep in dep_cities:
		strhtml = requests.get('https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep=' + urllib.request.quote(dep) + '&exclude=&extensionImg=255,175')
		arrive_dict = json.loads(strhtml.text)
		for arr_item in arrive_dict['data']:
			# This example only crawls domestic routes; to crawl international routes too, comment out the next two lines
			if arr_item['title'] != "国内":
				continue

			for arr_item_1 in arr_item['subModules']:
				for query in arr_item_1['items']:
					# This example only crawls 杭州-丽江 routes; to crawl 杭州-全国 routes, comment out the next two lines
					if query['query'] != "丽江":
						continue

					# Open the mobile search-result page for independent-travel routes
					driver.get("https://touch.dujia.qunar.com/p/list?cfrom=zyx&dep=" + urllib.request.quote(dep) + "&query=" + urllib.request.quote(query['query']) + "%e8%87%aa%e7%94%b1%e8%a1%8c&it=n_index_free")
					try:
						# we have to wait for the page to refresh
						WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".item.g-flexbox.list-item")))
					except Exception as e:
						print(str(e))
						raise

					print("dep:%s arr:%s" % (dep, query["query"]))

					# Scroll down 50 times in a row to load more results
					for i in range(50):
						time.sleep(random.uniform(2, 3))
						print("page %d" % (i + 1))
						# Simulate [Page Down] to scroll the list
						ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

					# Write the dep-arr independent-travel routes to the CSV file
					dump_routes_csv(dep, query["query"])

close_csv()
driver.close()
driver_.close()
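
For reference, the main loop above walks the JSON returned by the arriveRecommend interface. Below is a minimal sketch of the nested shape the code assumes, inferred purely from how the response is indexed; the titles and queries in the mock are made-up examples, not real data.

# A mock of the arriveRecommend response, shaped the way the main loop expects it
mock_response = {
    "data": [
        {
            "title": "国内",                       # only this branch is crawled
            "subModules": [
                {"items": [{"query": "丽江"}, {"query": "三亚"}]},
            ],
        },
        {"title": "国际", "subModules": []},       # filtered out by the title check
    ]
}

# The same traversal used in the __main__ block:
for arr_item in mock_response["data"]:
    if arr_item["title"] != "国内":
        continue
    for arr_item_1 in arr_item["subModules"]:
        for query in arr_item_1["items"]:
            print(query["query"])                  # -> 丽江, 三亚
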
Part 2: Data cleaning and modeling
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Read the route-info CSV file
df = pd.read_csv("D:/qunar_routes.csv")
print(df.head())
print(df.info())

# Extract the number of days and the price from the route info
df["天数"] = df.路线信息.str.extract(r'(\d+)天\d+晚', expand=False).apply(lambda x: int(x))
df["价格"] = df.路线信息.str.extract(r'(\d+)起/人', expand=False).apply(lambda x: int(x))
# Extract the rating and the class from the hotel info
df["酒店评分"] = df.酒店信息.str.extract(r'(\d\.\d)分', expand=False).apply(lambda x: float(x))
df["酒店等级"] = df.酒店信息.str.extract(r'\n(.*)', expand=False)

print (df.head())
print (df.info() )

# Map the hotel class from text to numeric values
class_map = {"其他": 0, "经济型": 1, "舒适型": 2, "高档型": 3, "豪华型": 4}
df["酒店等级"] = df["酒店等级"].map(class_map)

# Plot histograms of the variables to check for outliers
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
df["酒店等级"].plot(ax=axes[0], kind='hist', title="酒店等级")
df["酒店评分"].plot(ax=axes[1], kind='hist', title="酒店评分")
df["价格"].plot(ax=axes[2], kind='hist', title="价格")
plt.show()

# Extract the feature matrix X (days, hotel rating, hotel class) and the target y (price)
X, y = df[["天数", "酒店评分", "酒店等级"]].values, df["价格"].values

# Fit an OLS linear regression model
ols = sm.OLS(y, X)
result = ols.fit()

# Check the goodness of fit, R = 0.886
print(result.summary())

# Use the fitted linear regression model to predict route prices
y_pred = result.predict(X)

# The cost-performance ratio is defined as predicted price divided by actual price
ratio = y_pred / y
df["性价比"] = ratio

# Sort by cost-performance ratio in descending order
print(df.sort_values("性价比", ascending=False))
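
As a quick sanity check of the cleaning and ranking logic, the sketch below runs the same regular expressions on a fabricated route string and works through the cost-performance ratio on toy numbers. The sample text and prices are invented for illustration only and are not taken from the crawled data.

import pandas as pd

# A made-up route string in the format the regexes above assume: "<n>天<m>晚 ... <price>起/人"
sample = pd.Series(["杭州-丽江双飞5天4晚 2999起/人"])
days = sample.str.extract(r'(\d+)天\d+晚', expand=False).apply(int)
price = sample.str.extract(r'(\d+)起/人', expand=False).apply(int)
print(days[0], price[0])             # -> 5 2999

# Toy cost-performance example: if the model predicts 3600 for a route that
# actually sells for 2999, the ratio is above 1, i.e. the route is cheaper than
# comparable ones and therefore ranks higher in the recommendation.
y_pred_demo, y_actual_demo = 3600.0, 2999.0
print(y_pred_demo / y_actual_demo)   # -> about 1.2
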

"Don't waste your time looking back, you're not going that way."--《Vikings》
